[llvm] goldsteinn/dag shift flags (PR #91239)
via llvm-commits
llvm-commits@lists.llvm.org
Mon May 6 10:04:37 PDT 2024
https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/91239
- **[CodeGen] Regen some old tests; NFC**
- **[DAGCombiner] Set shift flags during visit.**
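
For context, "shift flags" here refers to the SDNodeFlags (nuw/nsw/exact) on ISD::SHL/SRL/SRA nodes. The sketch below is not the patch itself (this mail only carries the test regeneration); it is a minimal, hypothetical illustration of how such flags can be inferred during a DAGCombiner visit. SDNodeFlags, computeKnownBits(), ComputeNumSignBits(), and isConstOrConstSplat() are real SelectionDAG APIs; the helper name and exact placement are assumptions.

    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    // Hypothetical sketch: infer nuw/nsw on an ISD::SHL node from known
    // bits. Illustrative only; not copied from the patch.
    static void inferShlFlagsSketch(SDNode *N, SelectionDAG &DAG) {
      SDValue X = N->getOperand(0);
      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();

      // Only handle a constant (or constant-splat) shift amount.
      ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
      if (!C)
        return;
      uint64_t Amt = C->getAPIntValue().getLimitedValue(BitWidth);
      if (Amt == 0 || Amt >= BitWidth)
        return;

      SDNodeFlags Flags = N->getFlags();
      // shl is 'nuw' if no set bit is shifted out the top, i.e. the
      // operand has at least Amt known leading zeros.
      KnownBits Known = DAG.computeKnownBits(X);
      if (Known.countMinLeadingZeros() >= Amt)
        Flags.setNoUnsignedWrap(true);
      // shl is 'nsw' if the operand has more than Amt sign bits, so
      // shifting cannot change the sign bit.
      if (DAG.ComputeNumSignBits(X) > Amt)
        Flags.setNoSignedWrap(true);
      N->setFlags(Flags);
    }

The benefit of attaching flags at visit time rather than recomputing them at each use is that downstream combines can key off the flags cheaply; the actual heuristics in the PR may of course differ from this sketch.
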
From 88dea6982defe98e475ac4cd8dd75ff6844ff704 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Mon, 6 May 2024 11:41:18 -0500
Subject: [PATCH 1/2] [CodeGen] Regen some old tests; NFC
---
.../AArch64/aarch64-address-type-promotion.ll | 14 +-
.../CodeGen/AArch64/arm64-narrow-st-merge.ll | 125 +
llvm/test/CodeGen/AArch64/bswap-known-bits.ll | 8 +-
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 10 +-
.../AArch64/pull-binop-through-shift.ll | 8 +-
llvm/test/CodeGen/AArch64/shift-mod.ll | 4 +-
...vector_splat-const-shift-of-constmasked.ll | 90 +-
llvm/test/CodeGen/AMDGPU/build_vector.ll | 373 +
llvm/test/CodeGen/AMDGPU/fneg.ll | 657 ++
.../AMDGPU/kernel-argument-dag-lowering.ll | 332 +
.../CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll | 57 +
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll | 3 +
.../AMDGPU/llvm.amdgcn.struct.buffer.load.ll | 156 +
.../llvm.amdgcn.struct.ptr.buffer.load.ll | 156 +
.../AMDGPU/llvm.r600.read.local.size.ll | 343 +
llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 7168 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 2253 ++++++
llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll | 427 +
.../test/CodeGen/AMDGPU/shl-add-to-add-shl.ll | 31 +
llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll | 51 +
llvm/test/CodeGen/AMDGPU/store-private.ll | 1733 ++++
llvm/test/CodeGen/ARM/Windows/alloca.ll | 3 +
llvm/test/CodeGen/ARM/Windows/vla.ll | 4 +
llvm/test/CodeGen/ARM/and-cmpz.ll | 119 +
llvm/test/CodeGen/ARM/bfx.ll | 49 +-
llvm/test/CodeGen/ARM/sbfx.ll | 41 +-
llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll | 84 +-
llvm/test/CodeGen/ARM/shift-combine.ll | 127 +
llvm/test/CodeGen/BPF/remove_truncate_9.ll | 3 +
llvm/test/CodeGen/Mips/cins.ll | 49 +-
llvm/test/CodeGen/Mips/fabs.ll | 17 +-
llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll | 107 +-
llvm/test/CodeGen/Mips/fcopysign.ll | 123 +-
llvm/test/CodeGen/Mips/llvm-ir/abs.ll | 125 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 219 +
llvm/test/CodeGen/NVPTX/mulwide.ll | 24 +-
.../NVPTX/unaligned-param-load-store.ll | 3 +
llvm/test/CodeGen/PowerPC/coalesce-ext.ll | 13 +-
llvm/test/CodeGen/PowerPC/extsh.ll | 1 +
llvm/test/CodeGen/PowerPC/shl_sext.ll | 1 +
llvm/test/CodeGen/SystemZ/int-abs-01.ll | 61 +-
llvm/test/CodeGen/SystemZ/int-cmp-44.ll | 466 +-
llvm/test/CodeGen/SystemZ/int-mul-10.ll | 41 +-
llvm/test/CodeGen/SystemZ/int-neg-02.ll | 86 +-
llvm/test/CodeGen/Thumb2/bfx.ll | 19 +-
llvm/test/CodeGen/VE/Scalar/bitreverse.ll | 1 +
llvm/test/CodeGen/WebAssembly/conv.ll | 3 +
.../CodeGen/WebAssembly/simd-sext-inreg.ll | 5 +
llvm/test/CodeGen/X86/lvi-hardening-loads.ll | 196 +-
llvm/test/CodeGen/X86/sext-subreg.ll | 13 +-
llvm/test/CodeGen/X86/x86-64-extend-shift.ll | 7 +-
51 files changed, 15408 insertions(+), 601 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll b/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
index d8280dadc550ea..e14618251b6d7d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -o - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
@@ -5,13 +6,14 @@ target triple = "arm64-apple-macosx10.9"
; Check that sexts get promoted above adds.
define void @foo(ptr nocapture %a, i32 %i) {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: ldp w9, w10, [x8, #4]
+; CHECK-NEXT: add w9, w10, w9
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: _foo:
-; CHECK: add
-; CHECK-NEXT: ldp
-; CHECK-NEXT: add
-; CHECK-NEXT: str
-; CHECK-NEXT: ret
%add = add nsw i32 %i, 1
%idxprom = sext i32 %add to i64
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
diff --git a/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll b/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
index 81c3195584701c..01ad14b6fba52a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple aarch64 -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple aarch64 -mattr=+strict-align -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-STRICT
@@ -7,6 +8,19 @@
; CHECK-STRICT: strh wzr
; CHECK-STRICT: strh wzr
define void @Strh_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strh_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sbfiz x8, x1, #1, #32
+; CHECK-NEXT: str wzr, [x0, x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strh_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: strh wzr, [x8]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #2]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i16, ptr %P, i64 %idxprom
@@ -26,6 +40,21 @@ entry:
; CHECK-STRICT: strh wzr
; CHECK-STRICT: strh wzr
define void @Strh_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strh_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sbfiz x8, x1, #1, #32
+; CHECK-NEXT: str xzr, [x0, x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strh_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: strh wzr, [x8]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #2]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #4]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #6]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i16, ptr %P, i64 %idxprom
@@ -50,6 +79,18 @@ entry:
; CHECK-STRICT-LABEL: Strw_zero
; CHECK-STRICT: stp wzr, wzr
define void @Strw_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strw_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sbfiz x8, x1, #2, #32
+; CHECK-NEXT: str xzr, [x0, x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strw_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i32, ptr %P, i64 %idxprom
@@ -64,6 +105,17 @@ entry:
; CHECK-LABEL: Strw_zero_nonzero
; CHECK: stp wzr, w1
define void @Strw_zero_nonzero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strw_zero_nonzero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stp wzr, w1, [x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strw_zero_nonzero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, w1, [x8]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i32, ptr %P, i64 %idxprom
@@ -81,6 +133,18 @@ entry:
; CHECK-STRICT: stp wzr, wzr
; CHECK-STRICT: stp wzr, wzr
define void @Strw_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strw_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stp xzr, xzr, [x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strw_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8]
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #8]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i32, ptr %P, i64 %idxprom
@@ -106,6 +170,18 @@ entry:
; CHECK-STRICT: sturb wzr
; CHECK-STRICT: sturb wzr
define void @Sturb_zero(ptr nocapture %P, i32 %n) #0 {
+; CHECK-LABEL: Sturb_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw
+; CHECK-NEXT: sturh wzr, [x8, #-2]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturb_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw
+; CHECK-STRICT-NEXT: sturb wzr, [x8, #-2]
+; CHECK-STRICT-NEXT: sturb wzr, [x8, #-1]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -2
%idxprom = sext i32 %sub to i64
@@ -124,6 +200,18 @@ entry:
; CHECK-STRICT: sturh wzr
; CHECK-STRICT: sturh wzr
define void @Sturh_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturh_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-NEXT: stur wzr, [x8, #-6]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturh_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-4]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-6]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -2
%idxprom = sext i32 %sub to i64
@@ -144,6 +232,20 @@ entry:
; CHECK-STRICT: sturh wzr
; CHECK-STRICT: sturh wzr
define void @Sturh_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturh_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-NEXT: stur xzr, [x8, #-8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturh_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-6]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-8]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-4]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-2]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
%idxprom = sext i32 %sub to i64
@@ -169,6 +271,17 @@ entry:
; CHECK-STRICT-LABEL: Sturw_zero
; CHECK-STRICT: stp wzr, wzr
define void @Sturw_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturw_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stur xzr, [x8, #-16]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturw_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-16]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
%idxprom = sext i32 %sub to i64
@@ -187,6 +300,18 @@ entry:
; CHECK-STRICT: stp wzr, wzr
; CHECK-STRICT: stp wzr, wzr
define void @Sturw_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturw_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stp xzr, xzr, [x8, #-16]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturw_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-16]
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-8]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
%idxprom = sext i32 %sub to i64
diff --git a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
index 23619e47367d01..f13ef52f94a414 100644
--- a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
+++ b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -8,7 +8,7 @@ declare i64 @llvm.bswap.i64(i64)
define i1 @test1(i16 %arg) {
; CHECK-LABEL: test1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i16 %arg, 511
%b = call i16 @llvm.bswap.i16(i16 %a)
@@ -20,7 +20,7 @@ define i1 @test1(i16 %arg) {
define i1 @test2(i16 %arg) {
; CHECK-LABEL: test2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i16 %arg, 1
%b = call i16 @llvm.bswap.i16(i16 %a)
@@ -32,7 +32,7 @@ define i1 @test2(i16 %arg) {
define i1 @test3(i16 %arg) {
; CHECK-LABEL: test3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i16 %arg, 256
%b = call i16 @llvm.bswap.i16(i16 %a)
@@ -44,7 +44,7 @@ define i1 @test3(i16 %arg) {
define i1 @test4(i32 %arg) {
; CHECK-LABEL: test4:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i32 %arg, 2147483647 ; i32_MAX
%b = call i32 @llvm.bswap.i32(i32 %a)
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 32a62453202f40..60ceaf19731921 100644
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -289,7 +289,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
define i1 @scalar_i32_x_is_const_eq(i32 %y) nounwind {
; CHECK-LABEL: scalar_i32_x_is_const_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43605
+; CHECK-NEXT: mov w8, #43605 // =0xaa55
; CHECK-NEXT: movk w8, #43605, lsl #16
; CHECK-NEXT: lsl w8, w8, w0
; CHECK-NEXT: tst w8, #0x1
@@ -303,8 +303,8 @@ define i1 @scalar_i32_x_is_const_eq(i32 %y) nounwind {
define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
; CHECK-LABEL: scalar_i32_x_is_const2_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1
-; CHECK-NEXT: mov w9, #43605
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: mov w9, #43605 // =0xaa55
; CHECK-NEXT: lsl w8, w8, w0
; CHECK-NEXT: movk w9, #43605, lsl #16
; CHECK-NEXT: tst w8, w9
@@ -319,7 +319,7 @@ define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: scalar_i8_bitsinmiddle_slt:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #24
+; CHECK-NEXT: mov w8, #24 // =0x18
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: lsl w8, w8, w1
; CHECK-NEXT: and w8, w8, w0
@@ -334,7 +334,7 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-128
+; CHECK-NEXT: mov w8, #-128 // =0xffffff80
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: lsl w8, w8, w1
; CHECK-NEXT: and w8, w8, w0
diff --git a/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll b/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
index b3fbe8bdb6e308..a892bb85692d3e 100644
--- a/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
@@ -81,7 +81,7 @@ define i32 @xor_nosignbit_shl(i32 %x, ptr %dst) {
define i32 @add_signbit_shl(i32 %x, ptr %dst) {
; CHECK-LABEL: add_signbit_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-16777216
+; CHECK-NEXT: mov w8, #-16777216 // =0xff000000
; CHECK-NEXT: add w0, w8, w0, lsl #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
@@ -93,7 +93,7 @@ define i32 @add_signbit_shl(i32 %x, ptr %dst) {
define i32 @add_nosignbit_shl(i32 %x, ptr %dst) {
; CHECK-LABEL: add_nosignbit_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-16777216
+; CHECK-NEXT: mov w8, #-16777216 // =0xff000000
; CHECK-NEXT: add w0, w8, w0, lsl #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
@@ -195,7 +195,7 @@ define i32 @add_signbit_lshr(i32 %x, ptr %dst) {
define i32 @add_nosignbit_lshr(i32 %x, ptr %dst) {
; CHECK-LABEL: add_nosignbit_lshr:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147418112
+; CHECK-NEXT: mov w8, #2147418112 // =0x7fff0000
; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: lsr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
@@ -298,7 +298,7 @@ define i32 @add_signbit_ashr(i32 %x, ptr %dst) {
define i32 @add_nosignbit_ashr(i32 %x, ptr %dst) {
; CHECK-LABEL: add_nosignbit_ashr:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147418112
+; CHECK-NEXT: mov w8, #2147418112 // =0x7fff0000
; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: asr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
diff --git a/llvm/test/CodeGen/AArch64/shift-mod.ll b/llvm/test/CodeGen/AArch64/shift-mod.ll
index a90603195cf348..ac95b75168ed98 100644
--- a/llvm/test/CodeGen/AArch64/shift-mod.ll
+++ b/llvm/test/CodeGen/AArch64/shift-mod.ll
@@ -127,7 +127,7 @@ define i64 @ashr_add_shl_i36(i64 %r) {
define i64 @ashr_add_shl_mismatch_shifts1(i64 %r) {
; CHECK-LABEL: ashr_add_shl_mismatch_shifts1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4294967296
+; CHECK-NEXT: mov x8, #4294967296 // =0x100000000
; CHECK-NEXT: add x8, x8, x0, lsl #8
; CHECK-NEXT: asr x0, x8, #32
; CHECK-NEXT: ret
@@ -140,7 +140,7 @@ define i64 @ashr_add_shl_mismatch_shifts1(i64 %r) {
define i64 @ashr_add_shl_mismatch_shifts2(i64 %r) {
; CHECK-LABEL: ashr_add_shl_mismatch_shifts2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4294967296
+; CHECK-NEXT: mov x8, #4294967296 // =0x100000000
; CHECK-NEXT: add x8, x8, x0, lsr #8
; CHECK-NEXT: lsr x0, x8, #8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
index 7e958b266846a1..6525d6cd7458b5 100644
--- a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
@@ -328,7 +328,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_lshr_1(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_3(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #3
@@ -340,7 +340,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_3(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_4(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #4
@@ -352,7 +352,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_4(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_5(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #5
@@ -364,7 +364,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_5(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_6(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #6
@@ -432,7 +432,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_ashr_1(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_3(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #3
@@ -444,7 +444,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_3(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_4(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #4
@@ -456,7 +456,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_4(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_5(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #5
@@ -468,7 +468,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_5(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_6(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #6
@@ -565,7 +565,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_shl_10(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_3(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #3
@@ -577,7 +577,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_3(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_4(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #4
@@ -589,7 +589,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_4(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_5(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #5
@@ -601,7 +601,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_5(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_6(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #6
@@ -644,7 +644,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_lshr_1(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_7(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_7:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #7
@@ -656,7 +656,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_7(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_8(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #8
@@ -668,7 +668,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_8(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_9(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #9
@@ -680,7 +680,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_9(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_10(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #10
@@ -748,7 +748,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_ashr_1(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_7(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_7:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #7
@@ -760,7 +760,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_7(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_8(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #8
@@ -772,7 +772,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_8(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_9(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #9
@@ -784,7 +784,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_9(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_10(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #10
@@ -881,7 +881,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_shl_18(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_7(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_7:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #7
@@ -893,7 +893,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_7(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_8(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #8
@@ -905,7 +905,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_8(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_9(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #9
@@ -917,7 +917,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_9(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_10(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #10
@@ -948,7 +948,7 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) {
define <2 x i64> @test_128_i64_x_2_2147483647_mask_lshr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_lshr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #1
@@ -961,7 +961,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_lshr_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_15(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #15
@@ -973,7 +973,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_16(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #16
@@ -985,7 +985,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_16(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_17(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #17
@@ -997,7 +997,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_17(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_18(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #18
@@ -1010,7 +1010,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #1
@@ -1022,7 +1022,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_1(<2 x i64> %a
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_32(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #32
@@ -1055,7 +1055,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_34(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_2147483647_mask_ashr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #1
@@ -1068,7 +1068,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_ashr_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #15
@@ -1080,7 +1080,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_16(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #16
@@ -1092,7 +1092,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_16(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_17(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #17
@@ -1104,7 +1104,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_17(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_18(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #18
@@ -1117,7 +1117,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: sshr v0.2d, v0.2d, #1
@@ -1129,7 +1129,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_1(<2 x i64> %a
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: sshr v0.2d, v0.2d, #32
@@ -1162,7 +1162,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: add v0.2d, v0.2d, v0.2d
@@ -1174,7 +1174,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_32(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_shl_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #32
@@ -1205,7 +1205,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_34(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_15(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #15
@@ -1217,7 +1217,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_16(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #16
@@ -1229,7 +1229,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_16(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_17(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #17
@@ -1241,7 +1241,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_17(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #18
@@ -1254,7 +1254,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: add v0.2d, v0.2d, v0.2d
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 99755133f36d6a..a693e13f37ea36 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=R600,ALL
; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GFX6,GFX678,ALL
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL
@@ -17,6 +18,72 @@
; GFX10: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX11: global_store_b64 v2, v[0:1], s[0:1]
define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector2:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector2:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, 6
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector2:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector2:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector2:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 5
+; GFX11-NEXT: v_mov_b32_e32 v1, 6
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector2:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, 6
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
ret void
@@ -40,6 +107,86 @@ entry:
; GFX10: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX11: global_store_b128 v4, v[0:3], s[0:1]
define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector4:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, literal.x,
+; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Z, literal.x,
+; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector4:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, 6
+; GFX6-NEXT: v_mov_b32_e32 v2, 7
+; GFX6-NEXT: v_mov_b32_e32 v3, 8
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector4:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-NEXT: v_mov_b32_e32 v3, 8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector4:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v3, 8
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector4:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 5
+; GFX11-NEXT: v_mov_b32_e32 v1, 6
+; GFX11-NEXT: v_mov_b32_e32 v2, 7
+; GFX11-NEXT: v_mov_b32_e32 v3, 8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector4:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, 6
+; GFX940-NEXT: v_mov_b32_e32 v2, 7
+; GFX940-NEXT: v_mov_b32_e32 v3, 8
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
ret void
@@ -60,6 +207,65 @@ entry:
; GFX10: global_store_dword v0, v1, s[0:1]
; GFX11: global_store_b32 v0, v1, s[0:1]
define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector_v2i16:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV T4.X, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector_v2i16:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector_v2i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector_v2i16:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
ret void
@@ -90,6 +296,82 @@ entry:
; GFX10: global_store_dword v0, v1, s[0:1]
; GFX11: global_store_b32 v0, v1, s[0:1]
define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) {
+; R600-LABEL: build_vector_v2i16_trunc:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: OR_INT T4.X, PV.W, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector_v2i16_trunc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, 5, s4, 16
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16_trunc:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s4, s4, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16_trunc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector_v2i16_trunc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector_v2i16_trunc:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
%srl = lshr i32 %a, 16
%trunc = trunc i32 %srl to i16
%ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0
@@ -186,6 +468,93 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_endpgm
define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
+; R600-LABEL: build_v2i32_from_v4i16_shuffle:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 0, @10, KC0[], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
+; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV * T0.X, 0.0,
+; R600-NEXT: ALU clause starting at 11:
+; R600-NEXT: LSHL * T0.Y, T1.X, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: LSHL T0.X, T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, s0
+; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshl_b32 s3, s3, 16
+; GFX940-NEXT: s_lshl_b32 s2, s2, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
%zextended = zext <2 x i16> %shuf to <2 x i32>
@@ -193,3 +562,7 @@ entry:
store <2 x i32> %shifted, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
+; GFX1011: {{.*}}
+; GFX678: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 03ca780c903226..c7677942719de1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,GFX11 %s
@@ -10,6 +11,45 @@
; GCN: s_xor_b32 [[NEG_VAL:s[0-9]+]], [[VAL]], 0x80000000
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[NEG_VAL]]
define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: s_fneg_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub float -0.000000e+00, %in
store float %fneg, ptr addrspace(1) %out
ret void
@@ -22,6 +62,52 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) {
+; SI-LABEL: s_fneg_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT: s_xor_b32 s1, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
store <2 x float> %fneg, ptr addrspace(1) %out
ret void
@@ -38,6 +124,61 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) {
+; SI-LABEL: s_fneg_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s7, s7, 0x80000000
+; SI-NEXT: s_xor_b32 s6, s6, 0x80000000
+; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s7, s7, 0x80000000
+; VI-NEXT: s_xor_b32 s6, s6, 0x80000000
+; VI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000
+; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000
+; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000
+; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
store <4 x float> %fneg, ptr addrspace(1) %out
ret void
@@ -54,6 +195,41 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; R600-NOT: XOR
; R600: -KC0[2].Z
define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fsub0_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_sub_f32_e64 v0, 0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fsub0_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f32_e64 v0, 0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fsub0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f32_e64 v0, 0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fsub = fsub float 0.0, %bc
store float %fsub, ptr addrspace(1) %out
@@ -71,6 +247,45 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
; R600-NOT: XOR
; R600: -PV.W
define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fneg_free_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_free_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fneg_free_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fsub = fsub float -0.0, %bc
store float %fsub, ptr addrspace(1) %out
@@ -84,6 +299,41 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
; GCN-NOT: xor
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: fneg_fold_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_f32_e64 v0, -s4, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fold_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v0, -s4, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fneg_fold_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e64 v0, -s2, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fsub = fsub float -0.0, %in
%fmul = fmul float %fsub, %in
store float %fmul, ptr addrspace(1) %out
@@ -94,6 +344,41 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
; FUNC-LABEL: {{^}}bitpreserve_fneg_f32:
; GCN: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -4.0
define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: bitpreserve_fneg_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_f32_e64 v0, s4, -4.0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: bitpreserve_fneg_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v0, s4, -4.0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitpreserve_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
%int.abs = xor i32 %in.bc, 2147483648
%bc = bitcast i32 %int.abs to float
@@ -107,6 +392,45 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000
; GCN: v_mov_b32_e32 [[V_FNEG:v[0-9]+]], [[FNEG]]
define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fneg_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i32 %in, -2147483648
store i32 %fneg, ptr addrspace(1) %out
ret void
@@ -117,6 +441,11 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64
define i32 @v_fneg_i32(i32 %in) {
+; FUNC-LABEL: v_fneg_i32:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i32 %in, -2147483648
ret i32 %fneg
}
@@ -125,6 +454,41 @@ define i32 @v_fneg_i32(i32 %in) {
; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]]
; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]]
define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fneg_i32_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_sub_f32_e64 v0, 2.0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i32_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f32_e64 v0, 2.0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i32_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f32_e64 v0, 2.0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i32 %in, -2147483648
%bitcast = bitcast i32 %fneg to float
%fadd = fadd float %bitcast, 2.0
@@ -137,6 +501,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0
; GCN-NEXT: s_setpc_b64
define float @v_fneg_i32_fp_use(i32 %in) {
+; FUNC-LABEL: v_fneg_i32_fp_use:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i32 %in, -2147483648
%bitcast = bitcast i32 %fneg to float
%fadd = fadd float %bitcast, 2.0
@@ -146,6 +515,49 @@ define float @v_fneg_i32_fp_use(i32 %in) {
; FUNC-LABEL: {{^}}s_fneg_i64:
; GCN: s_xor_b32 s[[NEG_HI:[0-9]+]], s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: s_fneg_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i64 %in, -9223372036854775808
store i64 %fneg, ptr addrspace(1) %out
ret void
@@ -156,6 +568,11 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: s_setpc_b64
define i64 @v_fneg_i64(i64 %in) {
+; FUNC-LABEL: v_fneg_i64:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i64 %in, -9223372036854775808
ret i64 %fneg
}
@@ -163,6 +580,39 @@ define i64 @v_fneg_i64(i64 %in) {
; FUNC-LABEL: {{^}}s_fneg_i64_fp_use:
; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, -s{{\[[0-9]+:[0-9]+\]}}, 2.0
define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: s_fneg_i64_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i64_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i64_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i64 %in, -9223372036854775808
%bitcast = bitcast i64 %fneg to double
%fadd = fadd double %bitcast, 2.0
@@ -175,6 +625,11 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
; GCN-NEXT: s_setpc_b64
define double @v_fneg_i64_fp_use(i64 %in) {
+; FUNC-LABEL: v_fneg_i64_fp_use:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i64 %in, -9223372036854775808
%bitcast = bitcast i64 %fneg to double
%fadd = fadd double %bitcast, 2.0
@@ -186,6 +641,11 @@ define double @v_fneg_i64_fp_use(i64 %in) {
; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT: s_setpc_b64
define i16 @v_fneg_i16(i16 %in) {
+; FUNC-LABEL: v_fneg_i16:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i16 %in, -32768
ret i16 %fneg
}
@@ -198,6 +658,43 @@ define i16 @v_fneg_i16(i16 %in) {
; VI: s_load_dword [[IN:s[0-9]+]]
; VI: v_sub_f16_e64 v{{[0-9]+}}, 2.0, [[IN]]
define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
+; SI-LABEL: s_fneg_i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e64 v0, 2.0, s4
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f16_e64 v0, 2.0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i16 %in, -32768
%bitcast = bitcast i16 %fneg to half
%fadd = fadd half %bitcast, 2.0
@@ -215,6 +712,24 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT: s_setpc_b64
define half @v_fneg_i16_fp_use(i16 %in) {
+; SI-LABEL: v_fneg_i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i16 %in, -32768
%bitcast = bitcast i16 %fneg to half
%fadd = fadd half %bitcast, 2.0
@@ -231,6 +746,50 @@ define half @v_fneg_i16_fp_use(i16 %in) {
; VI: s_lshl_b32 s5, s5, 16
; VI: s_or_b32 s4, s4, s5
define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
+; SI-LABEL: s_fneg_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80008000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s5, s4, 16
+; VI-NEXT: s_xor_b32 s4, s4, 0x8000
+; VI-NEXT: s_xor_b32 s5, s5, 0x8000
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
store <2 x i16> %fneg, ptr addrspace(1) %out
@@ -249,6 +808,28 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; VI-NEXT: s_setpc_b64
define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
+; SI-LABEL: v_fneg_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
ret <2 x i16> %fneg
}
@@ -268,6 +849,56 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
; VI: v_add_f16_e64 v1, s4, 2.0
; VI: v_or_b32_e32 v0, v1, v0
define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) {
+; SI-LABEL: s_fneg_v2i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v1, 0x4000
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s5, s4, 16
+; VI-NEXT: s_xor_b32 s5, s5, 0x8000
+; VI-NEXT: s_xor_b32 s4, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v1, s4, 2.0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
%bitcast = bitcast <2 x i16> %fneg to <2 x half>
@@ -290,9 +921,35 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
; VI: v_or_b32_e32 v0, v0, v1
; VI: s_setpc_b64
define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
+; SI-LABEL: v_fneg_v2i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_v2i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, 0x4000
+; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_v2i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
%bitcast = bitcast <2 x i16> %fneg to <2 x half>
%fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
ret <2 x half> %fadd
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index 1a73df341108fe..8a9d731334ec5f 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HSA-VI,FUNC %s
; Repeat of some problematic tests in kernel-args.ll, with the IR
@@ -11,6 +12,16 @@
; HSA-VI: .amdhsa_kernarg_size 12
define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
+; GCN-LABEL: i1_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s2, s2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_byte v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
store i1 %x, ptr addrspace(1) %out, align 1
ret void
}
@@ -22,6 +33,20 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
; HSA-VI: .amdhsa_kernarg_size 12
define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
+; GCN-LABEL: v3i8_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s3, s2, 8
+; GCN-NEXT: s_and_b32 s4, s2, 0xff
+; GCN-NEXT: v_lshlrev_b16_e64 v2, 8, s3
+; GCN-NEXT: v_or_b32_e32 v2, s4, v2
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_short v0, v2, s[0:1]
+; GCN-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2
+; GCN-NEXT: s_endpgm
entry:
store <3 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -32,6 +57,19 @@ entry:
; HSA-VI: .amdhsa_kernarg_size 24
define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
+; GCN-LABEL: i65_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x10
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s6, 1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: global_store_byte v2, v3, s[0:1] offset:8
+; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-NEXT: s_endpgm
entry:
store i65 %in, ptr addrspace(1) %out, align 4
ret void
@@ -40,6 +78,9 @@ entry:
; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: .amdhsa_kernarg_size 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+; GCN-LABEL: empty_struct_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_endpgm
ret void
}
@@ -61,6 +102,30 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
; HSA-VI: .amdhsa_kernarg_size 40
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+; GCN-LABEL: struct_argument_alignment:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GCN-NEXT: s_load_dword s7, s[4:5], 0x18
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg1, 0
@@ -83,6 +148,28 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; HSA-VI: .amdhsa_kernarg_size 28
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+; GCN-LABEL: packed_struct_argument_alignment:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: global_load_dword v6, v2, s[4:5] offset:13
+; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v7, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: global_store_dword v[2:3], v7, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v[2:3], v6, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = extractvalue <{i32, i64}> %arg0, 0
%val1 = extractvalue <{i32, i64}> %arg0, 1
%val2 = extractvalue <{i32, i64}> %arg1, 0
@@ -103,6 +190,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; HSA-VI: .amdhsa_kernarg_size 64
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+; GCN-LABEL: struct_argument_alignment_after:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s10, s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
+; GCN-NEXT: s_load_dword s11, s[4:5], 0x18
+; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: global_store_dword v[4:5], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: global_store_dword v[4:5], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg2, 0
@@ -118,6 +236,23 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+; GCN-LABEL: array_3xi32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: global_store_short v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v[0:1], v1, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
store volatile i16 %arg0, ptr addrspace(1) undef
store volatile [3 x i32] %arg1, ptr addrspace(1) undef
ret void
@@ -126,6 +261,21 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+; GCN-LABEL: array_3xi16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: global_store_byte v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_short_d16_hi v[0:1], v1, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_short v[0:1], v1, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_short_d16_hi v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
store volatile i8 %arg0, ptr addrspace(1) undef
store volatile [3 x i16] %arg1, ptr addrspace(1) undef
ret void
@@ -136,6 +286,20 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}}
; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}}
define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) {
+; GCN-LABEL: v2i15_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s3, s2, 0x7fff
+; GCN-NEXT: s_bfe_u32 s2, s2, 0x100010
+; GCN-NEXT: s_lshl_b32 s2, s2, 15
+; GCN-NEXT: s_or_b32 s2, s3, s2
+; GCN-NEXT: s_andn2_b32 s2, s2, -2.0
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
entry:
store <2 x i15> %in, ptr addrspace(1) %out, align 4
ret void
@@ -148,6 +312,25 @@ entry:
; GCN: s_and_b32
; GCN: s_or_b32
define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) {
+; GCN-LABEL: v3i15_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s3, 0xffff
+; GCN-NEXT: s_and_b32 s5, s2, 0x7fff
+; GCN-NEXT: s_lshr_b32 s6, s2, 1
+; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 30
+; GCN-NEXT: s_and_b32 s4, s6, 0x3fff8000
+; GCN-NEXT: s_and_b32 s6, s3, 0x1fff
+; GCN-NEXT: s_or_b32 s4, s5, s4
+; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT: global_store_short v0, v1, s[0:1] offset:4
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
entry:
store <3 x i15> %in, ptr addrspace(1) %out, align 4
ret void
@@ -159,6 +342,14 @@ entry:
; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) {
+; GCN-LABEL: byref_constant_i8_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_load_ubyte v1, v0, s[4:5] offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i8, ptr addrspace(4) %in.byref
%ext = zext i8 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -170,6 +361,14 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out
; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) {
+; GCN-LABEL: byref_constant_i16_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i16, ptr addrspace(4) %in.byref
%ext = zext i16 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -180,6 +379,18 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou
; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GCN: .amdhsa_kernarg_size 16
define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_constant_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v0, v2, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -191,6 +402,23 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}}
; GCN: .amdhsa_kernarg_size 36
define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_constant_v4i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x20
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_mov_b32_e32 v5, s8
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v4, v5, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load <4 x i32>, ptr addrspace(4) %in.byref
store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -205,6 +433,19 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %
; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
; GCN: .amdhsa_kernarg_size 264
define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_align_constant_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v0, v2, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -216,6 +457,41 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}}
; GCN: .amdhsa_kernarg_size 132
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_natural_align_constant_v16i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x80
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_mov_b32_e32 v3, s15
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: global_store_dword v4, v0, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load <16 x i32>, ptr addrspace(4) %in.byref
store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -227,6 +503,15 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}}
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) {
+; GCN-LABEL: byref_global_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(1) %in.byref
store i32 %in, ptr addrspace(1) %out, align 4
ret void
@@ -235,6 +520,16 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out,
; GCN-LABEL: {{^}}byref_flat_i32_arg:
; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) {
+; GCN-LABEL: byref_flat_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_load_dword v0, v[0:1] offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i32, ptr %in.byref
store i32 %in, ptr addrspace(1) %out, align 4
ret void
@@ -245,6 +540,17 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p
; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) {
+; GCN-LABEL: byref_constant_32bit_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s0, s4, 8
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(6) %in.byref
store i32 %in, ptr addrspace(1) %out, align 4
ret void
@@ -260,6 +566,22 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu
; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; GCN: .amdhsa_kernarg_size 20
define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) {
+; GCN-LABEL: multi_byref_constant_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x10
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v0, v2, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in0 = load i32, ptr addrspace(4) %in0.byref
%in1 = load i32, ptr addrspace(4) %in1.byref
store volatile i32 %in0, ptr addrspace(1) %out, align 4
@@ -274,6 +596,13 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu
; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}}
; GCN: .amdhsa_kernarg_size 4
define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) {
+; GCN-LABEL: byref_constant_i32_arg_offset0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store i32 %in, ptr addrspace(1) undef, align 4
ret void
@@ -281,3 +610,6 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; HSA-VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 90e18a881340b3..5d243e3a5890a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
@@ -5,6 +6,18 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
; CHECK-LABEL: {{^}}ds_bpermute:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @ds_bpermute(ptr addrspace(1) %out, i32 %index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: ds_bpermute_b32 v2, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
@@ -13,6 +26,18 @@ define amdgpu_kernel void @ds_bpermute(ptr addrspace(1) %out, i32 %index, i32 %s
; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
define amdgpu_kernel void @ds_bpermute_imm_offset(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_imm_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: ds_bpermute_b32 v2, v0, v1 offset:4
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, ptr addrspace(1) %out, align 4
@@ -22,6 +47,19 @@ define amdgpu_kernel void @ds_bpermute_imm_offset(ptr addrspace(1) %out, i32 %ba
; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
define amdgpu_kernel void @ds_bpermute_imm_index(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_imm_index:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0xc
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: ds_bpermute_b32 v2, v0, v1 offset:64
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
@@ -31,6 +69,15 @@ define amdgpu_kernel void @ds_bpermute_imm_index(ptr addrspace(1) %out, i32 %bas
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
define void @ds_bpermute_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_add_shl:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; CHECK-NEXT: ds_bpermute_b32 v2, v2, v3 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%index = add i32 %base_index, 1
%byte_index = shl i32 %index, 2
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
@@ -42,6 +89,16 @@ define void @ds_bpermute_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %sr
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
define void @ds_bpermute_or_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_or_shl:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v2, 62, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; CHECK-NEXT: ds_bpermute_b32 v2, v2, v3 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%masked = and i32 %base_index, 62
%index = or i32 %masked, 1
%byte_index = shl i32 %index, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index 25b36173cc65b5..61b4d240ef6942 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -554,3 +555,5 @@ declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 5e03748bee08f5..94fbd0137a509e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI
@@ -7,6 +8,14 @@
;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -21,6 +30,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load_immoffs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0)
ret <4 x float> %data
@@ -31,6 +46,13 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load_immoffs_large:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_movk_i32 s4, 0x1ffc
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0)
ret <4 x float> %data
@@ -40,6 +62,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+; CHECK-LABEL: buffer_load_idx:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0)
ret <4 x float> %data
@@ -49,6 +76,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -58,6 +93,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs_imm:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%ofs = add i32 %1, 60
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -68,6 +111,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0)
ret <4 x float> %data
@@ -78,6 +126,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both_reversed:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -87,6 +141,11 @@ main_body:
;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x1:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret float %data
@@ -96,6 +155,11 @@ main_body:
;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x2:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret <2 x float> %data
@@ -105,6 +169,14 @@ main_body:
;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0
;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen
define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+; VI-LABEL: buffer_load_negative_offset:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
main_body:
%ofs.1 = add i32 %ofs, -16
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0)
@@ -117,6 +189,16 @@ main_body:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
+; VI-LABEL: buffer_load_mmo:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, v1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ; return to shader part epilog
entry:
store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -131,6 +213,14 @@ entry:
;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load_int:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v6, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -151,6 +241,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_ubyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i8 %tmp to i32
@@ -165,6 +261,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_ushort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i16 %tmp to i32
@@ -179,6 +281,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_sbyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i8 %tmp to i32
@@ -193,6 +301,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_sshort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i16 %tmp to i32
@@ -206,6 +320,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store half %val, ptr addrspace(3) %ptr
@@ -218,6 +339,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v2f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x half> %val, ptr addrspace(3) %ptr
@@ -230,6 +358,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v4f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
@@ -242,6 +377,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store i16 %val, ptr addrspace(3) %ptr
@@ -254,6 +396,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v2i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x i16> %val, ptr addrspace(3) %ptr
@@ -266,6 +415,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v4i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
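(For readers skimming the regenerated checks: every test in this file and the ptr-variant file below exercises the same intrinsic operand layout. A minimal hand-written sketch follows, not taken from the patch; the operand meanings are the usual ones for these intrinsics, with the cachepolicy bit assignments inferred from the glc/slc checks above.)

; Operands: rsrc, vindex, voffset, soffset, cachepolicy
; (cachepolicy bit 0 = glc, bit 1 = slc, per the glc/slc lines above).
declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32)

define amdgpu_ps <4 x float> @sketch(<4 x i32> inreg %rsrc, i32 %idx) {
main_body:
  ; %idx rides in the idxen VGPR; the constant voffset of 16 should fold
  ; into an offset:16 immediate field (compare buffer_load_immoffs above),
  ; and the zero soffset stays the literal 0 in the MUBUF encoding.
  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 16, i32 0, i32 0)
  ret <4 x float> %data
}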
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
index 2f9e6b0a1cf526..71adf4b2aaeab6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI
@@ -7,6 +8,14 @@
;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
@@ -21,6 +30,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load_immoffs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0)
ret <4 x float> %data
@@ -31,6 +46,13 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load_immoffs_large:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_movk_i32 s4, 0x1ffc
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 8188, i32 0)
ret <4 x float> %data
@@ -40,6 +62,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) {
+; CHECK-LABEL: buffer_load_idx:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0)
ret <4 x float> %data
@@ -49,6 +76,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -58,6 +93,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs_imm:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%ofs = add i32 %1, 60
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -68,6 +111,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0)
ret <4 x float> %data
@@ -78,6 +126,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both_reversed:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -87,6 +141,11 @@ main_body:
;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x1:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret float %data
@@ -96,6 +155,11 @@ main_body:
;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x2:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret <2 x float> %data
@@ -105,6 +169,14 @@ main_body:
;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0
;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen
define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) {
+; VI-LABEL: buffer_load_negative_offset:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
main_body:
%ofs.1 = add i32 %ofs, -16
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i32 0, i32 0)
@@ -117,6 +189,16 @@ main_body:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) {
+; VI-LABEL: buffer_load_mmo:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, v1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ; return to shader part epilog
entry:
store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -131,6 +213,14 @@ entry:
;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load_int:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v6, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <2 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
@@ -151,6 +241,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_ubyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i8 %tmp to i32
@@ -165,6 +261,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_ushort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_ushort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i16 %tmp to i32
@@ -179,6 +281,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_sbyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i8 %tmp to i32
@@ -193,6 +301,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_sshort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_sshort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i16 %tmp to i32
@@ -206,6 +320,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call half @llvm.amdgcn.struct.ptr.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store half %val, ptr addrspace(3) %ptr
@@ -218,6 +339,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v2f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x half> %val, ptr addrspace(3) %ptr
@@ -230,6 +358,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v4f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
@@ -242,6 +377,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store i16 %val, ptr addrspace(3) %ptr
@@ -254,6 +396,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v2i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x i16> %val, ptr addrspace(3) %ptr
@@ -266,6 +415,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v4i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index ab035b9de04b9d..f9ff7609755a93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN,SI-NOHSA,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,VI-NOHSA,GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
@@ -15,6 +16,38 @@
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_x:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_x:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_x:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.X, KC0[1].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, ptr addrspace(1) %out
@@ -30,6 +63,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_y:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_y:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x1c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_y:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.X, KC0[1].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, ptr addrspace(1) %out
@@ -45,6 +110,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_z:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_z:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_z:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.X, KC0[2].X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, ptr addrspace(1) %out
@@ -58,6 +155,40 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xy:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xy:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_xy:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[1].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -77,6 +208,42 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s2, s[0:1], 0x6
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s2, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dword s5, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_xz:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[2].X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -95,6 +262,42 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_yz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s0, s0, s1
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_yz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_yz:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, KC0[1].W, KC0[2].X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%y = call i32 @llvm.r600.read.local.size.y() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -116,6 +319,45 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xyz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6
+; SI-NEXT: s_load_dword s2, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_add_i32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xyz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x18
+; VI-NEXT: s_load_dword s6, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: s_add_i32 s4, s4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_xyz:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MULLO_INT * T0.X, KC0[1].Z, KC0[1].W,
+; EG-NEXT: ADD_INT T0.X, PS, KC0[2].X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -133,6 +375,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_x_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_x_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_x_known_bits:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[1].Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
@@ -148,6 +422,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_y_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_y_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x1c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_y_known_bits:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[1].W, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
@@ -163,6 +469,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_z_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_z_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_z_known_bits:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[2].X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
@@ -176,3 +514,8 @@ declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
attributes #0 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; GCN: {{.*}}
+; SI-NOHSA: {{.*}}
+; VI-NOHSA: {{.*}}
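(The local_size_*_known_bits tests above all share one IR shape; here is a minimal hand-written sketch of the known-bits fold they pin down, assuming only that the local size is provably bounded by 16 bits.)

  %size = call i32 @llvm.r600.read.local.size.x()
  %shl  = shl i32 %size, 16
  %lshr = lshr i32 %shl, 16
  ; The shl/lshr pair is equivalent to masking the low 16 bits. Known bits
  ; on %size let the DAG drop the pair entirely on SI/VI (the checks above
  ; store the loaded value unmasked), while the R600 path keeps an explicit
  ; AND_INT with literal 65535.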
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 4cc469b9b49a06..f7eb42a5f93227 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s
@@ -82,6 +83,1028 @@
; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
define amdgpu_ps float @ps_main(i32 %idx) {
+; SI-LABEL: ps_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: ps_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: ps_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: ps_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: ps_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: ps_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: ps_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: ps_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: ps_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: ps_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -135,6 +1158,1028 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_vs float @vs_main(i32 %idx) {
+; SI-LABEL: vs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: vs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: vs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: vs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: vs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: vs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: vs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: vs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: vs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: vs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -185,6 +2230,1032 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_cs float @cs_main(i32 %idx) {
+; SI-LABEL: cs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: cs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: cs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: cs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: cs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: cs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v27, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v27
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v5
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v27
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: cs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: cs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v27, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v27
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v27
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: cs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: cs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -217,6 +3288,1025 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_hs float @hs_main(i32 %idx) {
+; SI-LABEL: hs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: hs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: hs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: hs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: hs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: hs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: hs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: hs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: hs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: hs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -268,6 +4358,1025 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_gs float @gs_main(i32 %idx) {
+; SI-LABEL: gs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: gs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: gs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: gs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: gs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: gs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: gs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: gs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: gs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: gs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -327,6 +5436,1032 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
+; SI-LABEL: hs_ir_uses_scratch_offset:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s8, s8, s6
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; SI-NEXT: s_mov_b32 s2, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: hs_ir_uses_scratch_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s11, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s8, s8, s6
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; VI-NEXT: s_mov_b32 s2, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: hs_ir_uses_scratch_offset:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: hs_ir_uses_scratch_offset:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: hs_ir_uses_scratch_offset:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v30, v13
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_mov_b32 s2, s5
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%f = fadd float %v1, %v2
@@ -382,6 +6517,1032 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
+; SI-LABEL: gs_ir_uses_scratch_offset:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s8, s8, s6
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; SI-NEXT: s_mov_b32 s2, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: gs_ir_uses_scratch_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s11, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s8, s8, s6
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; VI-NEXT: s_mov_b32 s2, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: gs_ir_uses_scratch_offset:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: gs_ir_uses_scratch_offset:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: gs_ir_uses_scratch_offset:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v30, v13
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_mov_b32 s2, s5
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%f = fadd float %v1, %v2
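The checks above all come from one pattern: a dynamic extractelement from two <81 x float> constants forces both vectors to be materialized in scratch memory, after which the selected lane is re-loaded at a computed offset. That is why each check prefix scales the index by 4 (v_lshlrev_b32 ... 2, v0), masks it into range (v_and ... 0x1fc), and issues two scratch loads, the second 0x200 bytes above the first (one spilled vector per 512-byte block). A minimal standalone sketch of the same lowering shape, using a small vector for brevity (illustrative IR, not part of the patch):

  ; dynamic lane select lowered through stack memory
  define float @dyn_extract(<4 x float> %v, i32 %idx) {
    %slot = alloca <4 x float>, align 16     ; vector spilled to a stack slot
    store <4 x float> %v, ptr %slot
    %lane = and i32 %idx, 3                  ; clamp the index into the vector
    %addr = getelementptr float, ptr %slot, i32 %lane
    %f = load float, ptr %addr               ; scalar load of the chosen lane
    ret float %f
  }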
@@ -389,3 +7550,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
%r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3
ret <{i32, i32, i32, float}> %r2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FLATSCR: {{.*}}
+; GCN: {{.*}}
+; GFX9PLUS: {{.*}}
+; GFX9_10-MUBUF: {{.*}}
+; MUBUF: {{.*}}
+; SIVI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 38672da3c647b0..1b2b9d68fff847 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX89,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX9,GFX89,FUNC %s
@@ -16,6 +17,40 @@
; EG: LSHR * [[ADDR]]
; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
define amdgpu_kernel void @sext_in_reg_i1_i32(ptr addrspace(1) %out, i32 %in) #0 {
+; SI-LABEL: sext_in_reg_i1_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i1_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_bfe_i32 s0, s2, 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i1_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT * T1.X, KC0[2].Z, 0.0, 1,
%shl = shl i32 %in, 31
%sext = ashr i32 %shl, 31
store i32 %sext, ptr addrspace(1) %out
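The shl 31 / ashr 31 pair is the canonical sign-extend-in-register idiom for a 1-bit field; the SI and GFX9 checks select it to s_bfe_i32 with a packed source-1 immediate that appears to encode (width << 16) | offset, so 0x10000 is a 1-bit field at offset 0. An equivalent way to write the same operation (illustrative IR, not part of the patch):

  ; both functions compute the same value: bit 0 replicated across the i32
  define i32 @sext_bit0_shifts(i32 %x) {
    %shl = shl i32 %x, 31
    %sext = ashr i32 %shl, 31      ; selected as s_bfe_i32 ..., 0x10000
    ret i32 %sext
  }
  define i32 @sext_bit0_trunc(i32 %x) {
    %b = trunc i32 %x to i1
    %s = sext i1 %b to i32         ; explicit trunc/sext form of the same value
    ret i32 %s
  }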
@@ -33,6 +68,59 @@ define amdgpu_kernel void @sext_in_reg_i1_i32(ptr addrspace(1) %out, i32 %in) #0
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+; SI-LABEL: sext_in_reg_i8_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i8_to_i32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_sext_i32_i8 s0, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i8_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i8_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 24
%ashr = ashr i32 %shl, 24
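When the field is a whole byte or halfword at offset 0, the scalar ISA has dedicated forms, so the checks show s_sext_i32_i8 / s_sext_i32_i16 rather than a generic BFE; R600 keeps BFE_INT with the width as a literal. A minimal sketch of the byte case (illustrative IR, not part of the patch):

  define i32 @sext_low_byte(i32 %x) {
    %shl = shl i32 %x, 24
    %ashr = ashr i32 %shl, 24      ; selected as s_sext_i32_i8 above
    ret i32 %ashr
  }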
@@ -51,6 +139,59 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a,
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+; SI-LABEL: sext_in_reg_i16_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i16_to_i32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_sext_i32_i16 s0, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i16_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i16_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 16
%ashr = ashr i32 %shl, 16
@@ -69,6 +210,59 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a,
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_i8_to_v1i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i8_to_v1i32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_sext_i32_i8 s0, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i8_to_v1i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i8_to_v1i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add <1 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <1 x i32> %c, <i32 24>
%ashr = ashr <1 x i32> %shl, <i32 24>
@@ -83,6 +277,53 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i1_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i1_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i1_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T0.Y, PV.X,
%c = shl i64 %a, %b
%shl = shl i64 %c, 63
%ashr = ashr i64 %shl, 63
@@ -97,6 +338,54 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a,
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i8_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i8_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i8_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 56
%ashr = ashr i64 %shl, 56
@@ -112,6 +401,54 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a,
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i16_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i16_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 48
%ashr = ashr i64 %shl, 48
@@ -126,6 +463,53 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a,
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i32_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i32_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i32_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x200000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i32_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 32
%ashr = ashr i64 %shl, 32
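The i64 variants follow the same scheme one size up: s_lshl_b64 for the variable shift, then s_bfe_i64 whose immediate packs only the width (0x10000, 0x80000, 0x100000, 0x200000 for 1/8/16/32 bits at offset 0). EG has no native 64-bit shift, so the checks show the 32-bit expansion of the low word: shift by the amount mod 32, then select 0 when the amount is 32 or more. A rough IR equivalent of that AND/LSHL/CNDE_INT sequence (illustrative, not part of the patch):

  ; low word of (i64 << %b) on a 32-bit target, assuming %b < 64
  define i32 @shl64_lo(i32 %alo, i32 %b) {
    %amt = and i32 %b, 31                   ; shift amount mod 32
    %lo = shl i32 %alo, %amt
    %ge32 = and i32 %b, 32                  ; nonzero when the amount is >= 32
    %big = icmp ne i32 %ge32, 0
    %res = select i1 %big, i32 0, i32 %lo   ; low word is 0 for amounts >= 32
    ret i32 %res
  }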
@@ -161,6 +545,63 @@ define amdgpu_kernel void @sext_in_reg_i32_to_i64(ptr addrspace(1) %out, i64 %a,
; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i1_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 1
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
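The VGPR variants make a useful contrast with the scalar ones: v_bfe_i32 takes the offset and width as separate operands (v_bfe_i32 v0, v0, 0, 1) instead of a packed immediate, and the high word of the sign-extended i64 result is simply the low word shifted arithmetically by 31. A sketch of that high-word construction (illustrative IR, not part of the patch):

  ; (%lo, %hi) is the dwordx2 pair the checks store
  define i64 @widen(i32 %lo) {
    %hi = ashr i32 %lo, 31        ; v_ashrrev_i32 ... 31: copies of the sign bit
    %h = zext i32 %hi to i64
    %l = zext i32 %lo to i64
    %hs = shl i64 %h, 32
    %wide = or i64 %hs, %l        ; same value as: sext i32 %lo to i64
    ret i64 %wide
  }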
@@ -188,6 +629,64 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr ad
; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i8_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i8_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i8_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -215,6 +714,64 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr ad
; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i16_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i16_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -239,6 +796,60 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr a
; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[SHR]]]
define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i32_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i32_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i32_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -265,6 +876,61 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(ptr addrspace(1) %out, ptr a
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+; SI-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_bfe_i32 s2, s2, 0x190001
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_bfe_i32 s0, s0, 0x190001
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190001
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
%c = add i32 %a, %b
%x = shl i32 %c, 6
%y = ashr i32 %x, 7
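This case shows why the extract carries an offset at all: shl 6 followed by ashr 7 keeps bits [31:7] of the shifted value, which are bits [25:1] of %c, i.e. a signed field at offset 7 - 6 = 1 with width 32 - 7 = 25. Packing that as (width << 16) | offset gives (0x19 << 16) | 1 = 0x190001, exactly the s_bfe_i32 immediate in the checks; EG simply keeps the raw LSHL/ASHR pair. The pattern in isolation (illustrative IR, not part of the patch):

  define i32 @bfe_off1_w25(i32 %c) {
    %x = shl i32 %c, 6
    %y = ashr i32 %x, 7           ; s_bfe_i32 ..., 0x190001 on SI/GFX9
    ret i32 %y
  }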
@@ -288,6 +954,55 @@ define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(ptr addrspace(1) %
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_bfe_i32 s4, s4, 0x190001
+; SI-NEXT: s_bfe_i32 s5, s2, 0x190001
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i1_in_v2i32_other_amount:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_bfe_i32 s1, s1, 0x190001
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190001
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i1_in_v2i32_other_amount:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: ADD_INT T1.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 7(9.809089e-45), 6(8.407791e-45)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
@@ -306,6 +1021,51 @@ define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(ptr addrspace(
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i1_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
+; SI-NEXT: s_bfe_i32 s5, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i1_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_bfe_i32 s1, s1, 0x10000
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i1_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 31, i32 31>
%ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -327,6 +1087,67 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(ptr addrspace(1) %out, <2 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v4i1_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s8
+; SI-NEXT: s_add_i32 s4, s5, s9
+; SI-NEXT: s_add_i32 s5, s6, s10
+; SI-NEXT: s_add_i32 s6, s7, s11
+; SI-NEXT: s_bfe_i32 s6, s6, 0x10000
+; SI-NEXT: s_bfe_i32 s5, s5, 0x10000
+; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
+; SI-NEXT: s_bfe_i32 s7, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v4i1_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s4, s4, s8
+; GFX9-NEXT: s_add_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s6, s6, s10
+; GFX9-NEXT: s_add_i32 s7, s7, s11
+; GFX9-NEXT: s_bfe_i32 s7, s7, 0x10000
+; GFX9-NEXT: s_bfe_i32 s6, s6, 0x10000
+; GFX9-NEXT: s_bfe_i32 s5, s5, 0x10000
+; GFX9-NEXT: s_bfe_i32 s4, s4, 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v4i1_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[4].X, KC0[5].X,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].W, KC0[4].W,
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, 1,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Z, KC0[4].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Y, KC0[4].Y,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
%ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
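None of these targets has a vector BFE, so the <2 x i32> and <4 x i32> cases scalarize completely: one s_add_i32 plus one s_bfe_i32 (or BFE_INT on EG) per lane, as the checks above show. The vector form for reference (illustrative IR, not part of the patch):

  define <2 x i32> @sext_v2i1(<2 x i32> %c) {
    %shl = shl <2 x i32> %c, <i32 31, i32 31>
    %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>  ; two s_bfe_i32 ..., 0x10000
    ret <2 x i32> %ashr
  }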
@@ -344,6 +1165,52 @@ define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(ptr addrspace(1) %out, <4 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i8_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_sext_i32_i8 s4, s4
+; SI-NEXT: s_sext_i32_i8 s5, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i8_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_sext_i32_i8 s1, s1
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i8_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 24, i32 24>
%ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
@@ -365,6 +1232,70 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v4i8_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s8
+; SI-NEXT: s_add_i32 s4, s5, s9
+; SI-NEXT: s_add_i32 s5, s6, s10
+; SI-NEXT: s_add_i32 s6, s7, s11
+; SI-NEXT: s_sext_i32_i8 s6, s6
+; SI-NEXT: s_sext_i32_i8 s5, s5
+; SI-NEXT: s_sext_i32_i8 s4, s4
+; SI-NEXT: s_sext_i32_i8 s7, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v4i8_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s4, s4, s8
+; GFX9-NEXT: s_add_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s6, s6, s10
+; GFX9-NEXT: s_add_i32 s7, s7, s11
+; GFX9-NEXT: s_sext_i32_i8 s7, s7
+; GFX9-NEXT: s_sext_i32_i8 s6, s6
+; GFX9-NEXT: s_sext_i32_i8 s5, s5
+; GFX9-NEXT: s_sext_i32_i8 s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v4i8_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[4].X, KC0[5].X,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].W, KC0[4].W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Z, KC0[4].Z,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Y, KC0[4].Y,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
%ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -382,6 +1313,52 @@ define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i16_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_sext_i32_i16 s4, s4
+; SI-NEXT: s_sext_i32_i16 s5, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_sext_i32_i16 s1, s1
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i16_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 16, i32 16>
%ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
@@ -391,6 +1368,69 @@ define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2
; FUNC-LABEL: {{^}}testcase:
define amdgpu_kernel void @testcase(ptr addrspace(1) %out, i8 %a) #0 {
+; SI-LABEL: testcase:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_bfe_i32 s5, s2, 0x10000
+; SI-NEXT: s_max_i32 s4, s4, 0
+; SI-NEXT: s_and_b32 s2, s5, s2
+; SI-NEXT: s_xor_b32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: testcase:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sext_i32_i8 s0, s2
+; GFX9-NEXT: s_max_i32 s0, s0, 0
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-NEXT: s_xor_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: testcase:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, 1,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, T0.X,
+; EG-NEXT: MAX_INT * T1.W, PV.Z, 0.0,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: XOR_INT * T0.W, PS, PV.W,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and_a_1 = and i8 %a, 1
%cmp_eq = icmp eq i8 %and_a_1, 0
%cmp_slt = icmp slt i8 %a, 0
@@ -403,6 +1443,69 @@ define amdgpu_kernel void @testcase(ptr addrspace(1) %out, i8 %a) #0 {
; FUNC-LABEL: {{^}}testcase_3:
define amdgpu_kernel void @testcase_3(ptr addrspace(1) %out, i8 %a) #0 {
+; SI-LABEL: testcase_3:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_bfe_i32 s5, s2, 0x10000
+; SI-NEXT: s_max_i32 s4, s4, 0
+; SI-NEXT: s_and_b32 s2, s5, s2
+; SI-NEXT: s_xor_b32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: testcase_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sext_i32_i8 s0, s2
+; GFX9-NEXT: s_max_i32 s0, s0, 0
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-NEXT: s_xor_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: testcase_3:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, 1,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, T0.X,
+; EG-NEXT: MAX_INT * T1.W, PV.Z, 0.0,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: XOR_INT * T0.W, PS, PV.W,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and_a_1 = and i8 %a, 1
%cmp_eq = icmp eq i8 %and_a_1, 0
%cmp_slt = icmp slt i8 %a, 0
@@ -419,6 +1522,92 @@ define amdgpu_kernel void @testcase_3(ptr addrspace(1) %out, i8 %a) #0 {
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 {
+; SI-LABEL: vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 8
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: vgpr_sext_in_reg_v4i8_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
+; GFX9-NEXT: v_add_u32_e32 v1, v5, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v7, v3
+; GFX9-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: vgpr_sext_in_reg_v4i8_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ADD_INT * T0.W, T0.W, T1.W,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Z, T1.Z,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.X, T1.X,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%loada = load <4 x i32>, ptr addrspace(1) %a, align 16
%loadb = load <4 x i32>, ptr addrspace(1) %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -432,6 +1621,92 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out,
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 {
+; SI-LABEL: vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: vgpr_sext_in_reg_v4i16_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
+; GFX9-NEXT: v_add_u32_e32 v1, v5, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v7, v3
+; GFX9-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: vgpr_sext_in_reg_v4i16_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ADD_INT * T0.W, T0.W, T1.W,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Z, T1.Z,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.X, T1.X,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%loada = load <4 x i32>, ptr addrspace(1) %a, align 16
%loadb = load <4 x i32>, ptr addrspace(1) %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -447,6 +1722,86 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(ptr addrspace(1) %out
; GCN-NOT: bfe
; GCN: buffer_store_short
define amdgpu_kernel void @sext_in_reg_to_illegal_type(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %src) #0 {
+; SI-LABEL: sext_in_reg_to_illegal_type:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_max_i32_e32 v0, 0, v0
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_to_illegal_type:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_mov_b32 s10, s6
+; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s2
+; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: v_max_i32_e32 v0, 0, v0
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_to_illegal_type:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s10, s6
+; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_i32_e32 v0, 0, v0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_to_illegal_type:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
+; EG-NEXT: MAX_INT T0.W, PV.W, 0.0,
+; EG-NEXT: LSHL * T1.W, PS, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tmp5 = load i8, ptr addrspace(1) %src, align 1
%tmp2 = sext i8 %tmp5 to i32
%tmp2.5 = icmp sgt i32 %tmp2, 0
@@ -473,6 +1828,70 @@ define amdgpu_kernel void @sext_in_reg_to_illegal_type(ptr addrspace(1) nocaptur
; SI: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, i64 %s.val) #0 {
+; SI-LABEL: v_sext_in_reg_i1_to_i64_move_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 1
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_and_b32_e32 v3, s9, v3
+; SI-NEXT: v_and_b32_e32 v2, s8, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_to_i64_move_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_to_i64_move_use:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT * T1.W, PV.W, 0.0, 1,
+; EG-NEXT: AND_INT * T0.Y, PV.W, KC0[3].Z,
+; EG-NEXT: AND_INT T0.X, T1.W, KC0[3].Y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -503,6 +1922,69 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(ptr addrspace(1) %ou
; SI: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, i64 %s.val) #0 {
+; SI-LABEL: v_sext_in_reg_i32_to_i64_move_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_and_b32_e32 v3, s9, v3
+; SI-NEXT: v_and_b32_e32 v2, s8, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i32_to_i64_move_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i32_to_i64_move_use:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: AND_INT T0.X, PV.W, KC0[3].Y,
+; EG-NEXT: ASHR T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: AND_INT * T0.Y, PV.W, KC0[3].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -529,6 +2011,75 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
+; SI-LABEL: s_sext_in_reg_i1_i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: s_sext_in_reg_i1_i16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_lshl_b32 s4, s4, 15
+; GFX89-NEXT: s_sext_i32_i16 s4, s4
+; GFX89-NEXT: s_lshr_b32 s4, s4, 15
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i1_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s4, s4, 15
+; GFX9-NEXT: s_sext_i32_i16 s4, s4
+; GFX9-NEXT: s_lshr_b32 s4, s4, 15
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i1_i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, 1,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ld = load i32, ptr addrspace(4) %ptr
%in = trunc i32 %ld to i16
%shl = shl i16 %in, 15
@@ -548,6 +2099,77 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrs
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
+; SI-LABEL: s_sext_in_reg_i2_i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x20000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: s_sext_in_reg_i2_i16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_lshl_b32 s4, s4, 14
+; GFX89-NEXT: s_sext_i32_i16 s4, s4
+; GFX89-NEXT: s_lshr_b32 s4, s4, 14
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i2_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s4, s4, 14
+; GFX9-NEXT: s_sext_i32_i16 s4, s4
+; GFX9-NEXT: s_lshr_b32 s4, s4, 14
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i2_i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 30(4.203895e-44), 3(4.203895e-45)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ld = load i32, ptr addrspace(4) %ptr
%in = trunc i32 %ld to i16
%shl = shl i16 %in, 14
@@ -562,6 +2184,48 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrs
; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_sext_in_reg_i1_i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_write_b16 v0, v1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL * T0.W, T0.X, 1,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU 2, @42, KC0[CB0:0-32], KC1[]
+; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LDS_SHORT_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i16, ptr addrspace(1) %ptr, i32 %tid
%out.gep = getelementptr i16, ptr addrspace(3) %out, i32 %tid
@@ -583,6 +2247,64 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrs
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, i16 %s.val) nounwind {
+; SI-LABEL: v_sext_in_reg_i1_i16_nonload:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s12, s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, v1, v2
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_write_b16 v0, v1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_i16_nonload:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_i16_nonload:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @43, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL * T0.W, T0.X, 1,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU 0, @44, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
+; EG-NEXT: ALU 5, @45, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T1.W, T1.X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, T0.X, PV.W,
+; EG-NEXT: BFE_INT T1.W, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LDS_SHORT_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
@@ -609,6 +2331,60 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, p
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
+; SI-LABEL: s_sext_in_reg_i2_i16_arg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x20000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i2_i16_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s2, 14
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 14
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i2_i16_arg:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 30(4.203895e-44), 3(4.203895e-45)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i16 %in, 14
%sext = ashr i16 %shl, 14
store i16 %sext, ptr addrspace(1) %out
@@ -626,6 +2402,58 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
+; SI-LABEL: s_sext_in_reg_i8_i16_arg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i8_i16_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s2, 8
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i8_i16_arg:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i16 %in, 8
%sext = ashr i16 %shl, 8
store i16 %sext, ptr addrspace(1) %out
@@ -643,6 +2471,60 @@ define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
+; SI-LABEL: s_sext_in_reg_i15_i16_arg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i15_i16_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s2, 1
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i15_i16_arg:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 17(2.382207e-44), 3(4.203895e-45)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i16 %in, 1
%sext = ashr i16 %shl, 1
store i16 %sext, ptr addrspace(1) %out
@@ -654,6 +2536,68 @@ define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16
; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]]
; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]]
define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i1_to_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s3, 16
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: s_bfe_i32 s2, s2, 0x10000
+; SI-NEXT: s_bfe_i32 s3, s4, 0x10000
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i1_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i1_to_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 44, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
+; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, 1,
+; EG-NEXT: BFE_INT * T0.W, T0.W, 0.0, 1,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i16> %c, <i16 15, i16 15>
%ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
@@ -669,6 +2613,94 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(ptr addrspace(1) %out, <3 x i16> %a, <3 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v3i1_to_v3i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s8, s4, 16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_add_i32 s5, s5, s7
+; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_add_i32 s8, s8, s9
+; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
+; SI-NEXT: s_bfe_i32 s5, s5, 0x10000
+; SI-NEXT: s_bfe_i32 s6, s8, 0x10000
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_lshl_b32 s5, s6, 16
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v3i1_to_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_pk_add_u16 v0, s5, v0
+; GFX9-NEXT: v_pk_add_u16 v1, s4, v1
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 15, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 15, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v3i1_to_v3i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @18, KC0[], KC1[]
+; EG-NEXT: TEX 5 @6
+; EG-NEXT: ALU 25, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T8.X, 0
+; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
+; EG-NEXT: VTX_READ_16 T7.X, T5.X, 52, #3
+; EG-NEXT: VTX_READ_16 T8.X, T5.X, 46, #3
+; EG-NEXT: VTX_READ_16 T9.X, T5.X, 54, #3
+; EG-NEXT: VTX_READ_16 T10.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_16 T5.X, T5.X, 56, #3
+; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: MOV * T5.X, 0.0,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T10.X, T5.X,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, PS, 0.0, 1,
+; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T5.X, PV.W, PS,
+; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: MOV T5.Z, 0.0,
+; EG-NEXT: ADD_INT * T1.W, T8.X, T9.X,
+; EG-NEXT: ADD_INT * T2.W, T6.X, T7.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, 1,
+; EG-NEXT: BFE_INT * T1.W, T1.W, 0.0, 1,
+; EG-NEXT: LSHR T6.X, T0.W, literal.x,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
+; EG-NEXT: 2(2.802597e-45), -65536(nan)
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT T7.X, PV.W, PS,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15>
%ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15>
@@ -681,6 +2713,72 @@ define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(ptr addrspace(1) %out, <3 x
; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]]
; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]]
define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i2_to_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s3, 16
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: s_bfe_i32 s2, s2, 0x20000
+; SI-NEXT: s_bfe_i32 s3, s4, 0x20000
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i2_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 14, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 14, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i2_to_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
+; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
+; EG-NEXT: LSHL T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.W, PS, literal.x,
+; EG-NEXT: ASHR * T1.W, PV.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, PS, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i16> %c, <i16 14, i16 14>
%ashr = ashr <2 x i16> %shl, <i16 14, i16 14>
@@ -693,6 +2791,69 @@ define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x
; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]]
; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]]
define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i8_to_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s3, 16
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_sext_i32_i8 s3, s4
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i8_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i8_to_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 10, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 44, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
+; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i16> %c, <i16 8, i16 8>
%ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
@@ -708,6 +2869,95 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(ptr addrspace(1) %out, <3 x i16> %a, <3 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v3i8_to_v3i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s8, s4, 16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_add_i32 s5, s5, s7
+; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_add_i32 s8, s8, s9
+; SI-NEXT: s_sext_i32_i8 s4, s4
+; SI-NEXT: s_sext_i32_i8 s5, s5
+; SI-NEXT: s_sext_i32_i8 s6, s8
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_lshl_b32 s5, s6, 16
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v3i8_to_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_pk_add_u16 v0, s5, v0
+; GFX9-NEXT: v_pk_add_u16 v1, s4, v1
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v3i8_to_v3i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @18, KC0[], KC1[]
+; EG-NEXT: TEX 5 @6
+; EG-NEXT: ALU 26, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T8.X, 0
+; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
+; EG-NEXT: VTX_READ_16 T7.X, T5.X, 52, #3
+; EG-NEXT: VTX_READ_16 T8.X, T5.X, 46, #3
+; EG-NEXT: VTX_READ_16 T9.X, T5.X, 54, #3
+; EG-NEXT: VTX_READ_16 T10.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_16 T5.X, T5.X, 56, #3
+; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: MOV * T5.X, 0.0,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T10.X, T5.X,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T5.X, PV.W, PS,
+; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: MOV T5.Z, 0.0,
+; EG-NEXT: ADD_INT * T1.W, T8.X, T9.X,
+; EG-NEXT: ADD_INT * T2.W, T6.X, T7.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T1.W, T1.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T6.X, T0.W, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT T7.X, PV.W, PS,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8>
%ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8>
@@ -719,3 +2969,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; GCN: {{.*}}
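The sext_in_reg tests above all pin down the same idiom: shifting left by (width - bits) and arithmetic-shifting back by the same amount sign-extends the low bits in place, which SI folds to s_sext_i32_i8 and GFX9 keeps as the packed shl/ashr pair. A minimal scalar sketch of the idiom (function name invented for illustration):

define i16 @sext_in_reg_i8_in_i16_sketch(i16 %c) {
  ; shl 8 then ashr 8 sign-extends the low 8 bits of %c in place
  %shl = shl i16 %c, 8
  %sext = ashr i16 %shl, 8
  ret i16 %sext
}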
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 1164d3bd770887..d55e201394a318 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN %s
@@ -9,6 +10,24 @@
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_35(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_35:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 3, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 35
store i64 %shl, ptr addrspace(1) %out
@@ -21,6 +40,24 @@ define amdgpu_kernel void @lshr_i64_35(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_63(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 63
store i64 %shl, ptr addrspace(1) %out
@@ -33,6 +70,24 @@ define amdgpu_kernel void @lshr_i64_63(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_33(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_33:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 33
store i64 %shl, ptr addrspace(1) %out
@@ -44,6 +99,23 @@ define amdgpu_kernel void @lshr_i64_33(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 32
store i64 %shl, ptr addrspace(1) %out
@@ -59,6 +131,24 @@ define amdgpu_kernel void @lshr_i64_32(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @lshr_and_i64_35(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_and_i64_35:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_bfe_u32 v0, v0, 8, 23
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
%shl = lshr i64 %and, 40
@@ -74,6 +164,24 @@ define amdgpu_kernel void @lshr_and_i64_35(ptr addrspace(1) %out, ptr addrspace(
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @shl_i64_const_35(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: shl_i64_const_35:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 35
store i64 %shl, ptr addrspace(1) %out
@@ -85,6 +193,23 @@ define amdgpu_kernel void @shl_i64_const_35(ptr addrspace(1) %out, ptr addrspace
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @shl_i64_const_32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: shl_i64_const_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 32
store i64 %shl, ptr addrspace(1) %out
@@ -97,6 +222,24 @@ define amdgpu_kernel void @shl_i64_const_32(ptr addrspace(1) %out, ptr addrspace
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @shl_i64_const_63(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: shl_i64_const_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 63
store i64 %shl, ptr addrspace(1) %out
@@ -107,6 +250,23 @@ define amdgpu_kernel void @shl_i64_const_63(ptr addrspace(1) %out, ptr addrspace
; GCN-LABEL: {{^}}ashr_i64_const_32:
define amdgpu_kernel void @ashr_i64_const_32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: ashr_i64_const_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = ashr i64 %val, 32
store i64 %shl, ptr addrspace(1) %out
@@ -115,6 +275,24 @@ define amdgpu_kernel void @ashr_i64_const_32(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}ashr_i64_const_63:
define amdgpu_kernel void @ashr_i64_const_63(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: ashr_i64_const_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = ashr i64 %val, 63
store i64 %shl, ptr addrspace(1) %out
@@ -126,6 +304,23 @@ define amdgpu_kernel void @ashr_i64_const_63(ptr addrspace(1) %out, ptr addrspac
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
define amdgpu_kernel void @trunc_shl_31_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_31_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 31
%trunc = trunc i64 %shl to i32
@@ -138,6 +333,23 @@ define amdgpu_kernel void @trunc_shl_31_i32_i64(ptr addrspace(1) %out, ptr addrs
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
; GCN: buffer_store_short [[SHL]]
define amdgpu_kernel void @trunc_shl_15_i16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_15_i16_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 15, v0
+; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 15
%trunc = trunc i64 %shl to i16
@@ -150,6 +362,23 @@ define amdgpu_kernel void @trunc_shl_15_i16_i64(ptr addrspace(1) %out, ptr addrs
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
; GCN: buffer_store_short [[SHL]]
define amdgpu_kernel void @trunc_shl_15_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_15_i16_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 15, v0
+; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in
%shl = shl i32 %val, 15
%trunc = trunc i32 %shl to i16
@@ -162,6 +391,23 @@ define amdgpu_kernel void @trunc_shl_15_i16_i32(ptr addrspace(1) %out, ptr addrs
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
; GCN: buffer_store_byte [[SHL]]
define amdgpu_kernel void @trunc_shl_7_i8_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_7_i8_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 7
%trunc = trunc i64 %shl to i8
@@ -175,6 +421,24 @@ define amdgpu_kernel void @trunc_shl_7_i8_i64(ptr addrspace(1) %out, ptr addrspa
; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
; GCN: buffer_store_byte [[AND]]
define amdgpu_kernel void @trunc_shl_1_i2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_1_i2_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 2, v0
+; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 1
%trunc = trunc i64 %shl to i2
@@ -187,6 +451,23 @@ define amdgpu_kernel void @trunc_shl_1_i2_i64(ptr addrspace(1) %out, ptr addrspa
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
define amdgpu_kernel void @trunc_shl_1_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_1_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 1
%trunc = trunc i64 %shl to i32
@@ -199,6 +480,23 @@ define amdgpu_kernel void @trunc_shl_1_i32_i64(ptr addrspace(1) %out, ptr addrsp
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
define amdgpu_kernel void @trunc_shl_16_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_16_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 16
%trunc = trunc i64 %shl to i32
@@ -210,6 +508,15 @@ define amdgpu_kernel void @trunc_shl_16_i32_i64(ptr addrspace(1) %out, ptr addrs
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[ZERO]]
define amdgpu_kernel void @trunc_shl_33_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_33_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 33
%trunc = trunc i64 %shl to i32
@@ -223,6 +530,24 @@ define amdgpu_kernel void @trunc_shl_33_i32_i64(ptr addrspace(1) %out, ptr addrs
; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
; GCN: buffer_store_dwordx2 v[[[RESLO]]:[[RESHI]]]
define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_16_v2i32_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) %in
%shl = shl <2 x i64> %val, <i64 16, i64 16>
%trunc = trunc <2 x i64> %shl to <2 x i32>
@@ -236,6 +561,26 @@ define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(ptr addrspace(1) %out, ptr a
; GCN: buffer_store_dword v[[RESLO]]
; GCN: buffer_store_dwordx2 v[[[RESLO]]:[[RESHI]]]
define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_31_i32_i64_multi_use:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 31
%trunc = trunc i64 %shl to i32
@@ -249,6 +594,22 @@ define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(ptr addrspace(1) %out,
; GCN-NOT: v_lshl_b64
; GCN-NOT: v_lshlrev_b64
define amdgpu_kernel void @trunc_shl_and31(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_and31:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, s8, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp3 = and i32 %arg2, 31
@@ -265,6 +626,23 @@ bb:
; GCN-NOT: v_lshl_b64
; GCN-NOT: v_lshlrev_b64
define amdgpu_kernel void @trunc_shl_and30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_and30:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_and_b32 s4, s8, 30
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp3 = and i32 %arg2, 30
@@ -279,6 +657,22 @@ bb:
; Negative test, wrong constant: the 63 mask allows shift amounts >= 32
; GCN: v_lshl_b64
define amdgpu_kernel void @trunc_shl_wrong_and63(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_wrong_and63:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s8
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp3 = and i32 %arg2, 63
@@ -293,6 +687,22 @@ bb:
; Negative test, shift amount can be the full 64 bits
; GCN: v_lshl_b64
define amdgpu_kernel void @trunc_shl_no_and(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_no_and:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s8
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp4 = zext i32 %arg2 to i64
@@ -308,6 +718,23 @@ bb:
; GCN-DAG: v_lshl_b64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 5
; GCN-DAG: v_lshl_b64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 6
define amdgpu_kernel void @trunc_shl_vec_vec(ptr addrspace(1) %arg) {
+; GCN-LABEL: trunc_shl_vec_vec:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], 4
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 3
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 5
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NEXT: s_endpgm
bb:
%v = load <4 x i64>, ptr addrspace(1) %arg, align 32
%shl = shl <4 x i64> %v, <i64 3, i64 4, i64 5, i64 6>
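When the shift amount of an i64 lshr is a constant of 32 or more, only the high dword can contribute, so the checks above load just that dword (the offset:4 buffer_load), shift it by (amount - 32), and store a zeroed high half; the shl cases mirror this with the low dword and a zeroed low half. Equivalent IR for the 35 case, as a sketch (function name invented):

define i64 @lshr_i64_35_sketch(i64 %val) {
  ; lshr i64 %val, 35 only uses bits [63:32]; shift them right by 35 - 32 = 3
  %hi64 = lshr i64 %val, 32
  %hi = trunc i64 %hi64 to i32
  %shr = lshr i32 %hi, 3
  %res = zext i32 %shr to i64 ; high half of the result is zero
  ret i64 %res
}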
diff --git a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
index 0f60790ff396fc..7df9ff34f4feec 100644
--- a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck %s
; Check transformation shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
@@ -10,6 +11,21 @@
; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADD]]
; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]:
define amdgpu_kernel void @add_const_offset(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: add_const_offset:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, 0xc80, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_load_dword v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = add i32 %id, 200
@@ -27,6 +43,21 @@ bb:
; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]]
; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]:
define amdgpu_kernel void @or_const_offset(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: or_const_offset:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NEXT: v_or_b32_e32 v0, 0x1000, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_load_dword v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = or i32 %id, 256
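The immediates in the two tests above follow directly from the transform named in the file header: 200 << 4 = 0xc80 feeds the v_add_u32, and 256 << 4 = 0x1000 feeds the v_or_b32. A sketch of the add form (function name invented); the rewrite is exact for both operators, since a left shift by c1 distributes over or bitwise and over add modulo 2^32:

define i32 @add_const_offset_sketch(i32 %id) {
  ; shl (add %id, 200), 4 becomes add (shl %id, 4), 3200 (0xc80)
  %shl = shl i32 %id, 4
  %off = add i32 %shl, 3200
  ret i32 %off
}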
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 9127cc3ffb34ee..2968a63b150ad2 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
@@ -314,6 +315,18 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
+; GCN-LABEL: shl_add_ptr_combine_2use_lds:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: ds_write_b32 v1, v2 offset:32
+; GCN-NEXT: v_mov_b32_e32 v1, 10
+; GCN-NEXT: ds_write_b32 v0, v1 offset:64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
%shl1 = shl i32 %idx.add, 4
@@ -364,6 +377,18 @@ define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32
define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
+; GCN-LABEL: shl_add_ptr_combine_2use_private:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, 10
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx = zext i16 %idx.arg to i32
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 2
@@ -414,6 +439,19 @@ define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.a
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
+; GCN-LABEL: shl_or_ptr_combine_2use_lds:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GCN-NEXT: ds_write_b32 v0, v2 offset:8
+; GCN-NEXT: v_mov_b32_e32 v0, 10
+; GCN-NEXT: ds_write_b32 v1, v0 offset:16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx.shl = shl i32 %idx, 1
%idx.add = or i32 %idx.shl, 1
%shl0 = shl i32 %idx.add, 3
@@ -431,6 +469,19 @@ define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}}{{$}}
; GCN-DAG: ds_write_b32 [[SCALE1]], v{{[0-9]+}}{{$}}
define void @shl_or_ptr_not_combine_2use_lds(i32 %idx) #0 {
+; GCN-LABEL: shl_or_ptr_not_combine_2use_lds:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 1, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: ds_write_b32 v1, v2
+; GCN-NEXT: v_mov_b32_e32 v1, 10
+; GCN-NEXT: ds_write_b32 v0, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx.add = or i32 %idx, 1
%shl0 = shl i32 %idx.add, 3
%shl1 = shl i32 %idx.add, 4
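The ds_write immediates above come from distributing the shift over the pointer index: with %idx.add = add nuw i32 %idx, 4, shifting by 3 and 4 peels off 4 << 3 = 32 and 4 << 4 = 64 as the offset:32/offset:64 fields, and the preceding or-based test works because the bit set by the or is known disjoint after the shl by 1. The negative test keeps a plain or on an unshifted index, so no constant can be peeled off. A sketch of the address arithmetic (function name invented):

define i32 @shl_add_ptr_sketch(i32 %idx) {
  ; (%idx + 4) << 3 == (%idx << 3) + 32; the +32 folds into the ds offset field
  %base = shl i32 %idx, 3
  %addr = add i32 %base, 32
  ret i32 %addr
}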
diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll
index 1c4ac88c9ed398..8e2d464bad2ddf 100644
--- a/llvm/test/CodeGen/AMDGPU/store-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-private.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -16,6 +17,47 @@
; SI: buffer_store_byte
define amdgpu_kernel void @store_i1(ptr addrspace(5) %out) {
+; EG-LABEL: store_i1:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 15, @0, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, 1, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i1:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 15, @0, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, 1, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
store i1 true, ptr addrspace(5) %out
ret void
@@ -47,6 +89,55 @@ entry:
; SI: buffer_store_byte
define amdgpu_kernel void @store_i8(ptr addrspace(5) %out, i8 %in) {
+; EG-LABEL: store_i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: MOV * T1.X, 0.0,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_8 T1.X, T1.X, 40, #3
+; EG-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T1.X, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: MOV * T1.X, 0.0,
+; CM-NEXT: TEX 0 @0
+; CM-NEXT: VTX_READ_8 T1.X, T1.X, 40, #3
+; CM-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, T1.X, T1.W, BS:VEC_120/SCL_212
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
store i8 %in, ptr addrspace(5) %out
ret void
@@ -75,6 +166,58 @@ entry:
; SI: buffer_store_short
define amdgpu_kernel void @store_i16(ptr addrspace(5) %out, i16 %in) {
+; EG-LABEL: store_i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @3, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: MOV * T1.X, 0.0,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T1.X, T1.X, 40, #3
+; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, T1.X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @3, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: MOV * T1.X, 0.0,
+; CM-NEXT: TEX 0 @0
+; CM-NEXT: VTX_READ_16 T1.X, T1.X, 40, #3
+; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
store i16 %in, ptr addrspace(5) %out
ret void
@@ -105,6 +248,92 @@ entry:
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
+; EG-LABEL: store_i24:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 37, @5, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; EG-NEXT: LSHL T1.W, T3.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: MOV * T2.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T2.W, KC0[2].Z, literal.x, PS,
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, T0.X, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i24:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 38, @5, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.X, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, literal.x, PV.W,
+; CM-NEXT: MOV * T1.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; CM-NEXT: BFE_UINT T2.Z, KC0[2].Z, literal.x, PV.W,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
store i24 %in, ptr addrspace(5) %out
ret void
@@ -123,6 +352,25 @@ entry:
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT
define amdgpu_kernel void @store_i25(ptr addrspace(5) %out, i25 %in) {
+; EG-LABEL: store_i25:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 33554431(9.403954e-38), 2(2.802597e-45)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i25:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 33554431(9.403954e-38), 2(2.802597e-45)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
store i25 %in, ptr addrspace(5) %out
ret void
@@ -144,6 +392,59 @@ entry:
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i8(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 21, @7, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.Z, KC0[3].X, literal.x,
+; EG-NEXT: AND_INT T2.W, KC0[2].W, literal.y,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.z,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T3.W, literal.x, PS,
+; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T3.W, PV.W,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T3.W, T0.Y, PS,
+; EG-NEXT: LSHL * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 21, @7, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Y, KC0[3].X, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
+; CM-NEXT: LSHL * T1.W, PV.W, literal.z,
+; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.Z,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, ptr addrspace(5) %out
@@ -175,6 +476,87 @@ entry:
; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i8_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 34, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T1.W, T3.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 1,
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
+; EG-NEXT: LSHL * T0.W, T2.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i8_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 36, @8, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, 1,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].X, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, ptr addrspace(5) %out, align 1
@@ -194,6 +576,31 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v2i16(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL T0.W, KC0[3].X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHL T0.Z, KC0[3].X, literal.x,
+; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.y,
+; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, ptr addrspace(5) %out
@@ -226,6 +633,89 @@ entry:
; SI: buffer_store_short
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i16_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 35, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T1.W, T3.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
+; EG-NEXT: LSHL * T0.W, T2.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i16_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].X, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, ptr addrspace(5) %out, align 2
@@ -243,6 +733,51 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i8(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 17, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.y,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 17, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, KC0[3].Z, literal.x,
+; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.z,
+; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, ptr addrspace(5) %out
@@ -302,6 +837,182 @@ entry:
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i8_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 81, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T1.W, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[4].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T3.W, T0.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T2.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, 1,
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[3].Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
+; EG-NEXT: LSHL * T0.W, T3.W, T0.W,
+; EG-NEXT: LSHR T5.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: LSHL T0.W, T3.W, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: LSHL * T1.W, T2.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: OR_INT T1.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T2.W, literal.x, PS,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i8_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 84, @12, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR * T1.W, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[4].X, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T2.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, 1,
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, KC0[3].Z, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: LSHR T4.Z, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T4.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: LSHL T1.Z, T1.Z, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.y,
+; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT: AND_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT T1.Z, PV.W, PV.Z,
+; CM-NEXT: LSHL * T0.W, T0.Z, literal.y,
+; CM-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; CM-NEXT: OR_INT T2.Y, PV.Z, PV.W,
+; CM-NEXT: AND_INT T0.Z, KC0[3].Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T4.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, ptr addrspace(5) %out, align 1
@@ -413,6 +1124,440 @@ entry:
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32> %in) {
+; EG-LABEL: store_v8i8_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 106, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[5].X, literal.y,
+; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[6].X, literal.y,
+; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: AND_INT * T0.W, KC0[4].Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -65281(nan), 8(1.121039e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: AND_INT * T0.W, KC0[5].Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -65281(nan), 8(1.121039e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: AND_INT * T0.W, KC0[4].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: AND_INT * T0.W, KC0[5].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[4].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: -256(nan), 255(3.573311e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T3.X, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.Z, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T3.W, literal.x, PV.W,
+; EG-NEXT: LSHR * T4.W, T0.Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT: NOT_INT * T3.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Z, PV.W,
+; EG-NEXT: LSHL T1.W, T4.W, T1.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T0.Z, T2.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, T3.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: MOV * T3.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T5.W, T0.Y, literal.x, PS,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.Y, PS,
+; EG-NEXT: LSHL T1.W, PV.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 1,
+; EG-NEXT: LSHR T5.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T4.W, T0.Y, literal.x, T3.W,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T1.Y, PS,
+; EG-NEXT: LSHL * T1.W, PV.W, T1.W,
+; EG-NEXT: ALU 102, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR T4.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T1.W, T2.W, T1.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T0.W, T1.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: LSHR * T4.W, T0.Z, literal.y,
+; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T4.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T2.W, T0.Z, literal.x, T3.W,
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T3.W, T0.Z, literal.x, T3.W,
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x,
+; EG-NEXT: AND_INT T1.W, KC0[5].Y, literal.y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
+; EG-NEXT: -256(nan), 255(3.573311e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PS,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v8i8_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 107, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[5].X, literal.y,
+; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T2.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[6].X, literal.y,
+; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: AND_INT * T0.W, KC0[4].Z, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -65281(nan), 8(1.121039e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: AND_INT * T0.W, KC0[5].Z, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -65281(nan), 8(1.121039e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: AND_INT * T0.W, KC0[4].W, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: AND_INT * T0.W, KC0[5].W, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: AND_INT T1.Y, PV.Y, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[4].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: -256(nan), 255(3.573311e-43)
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOV * T3.X, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Z, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, T0.Y, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, T0.Z, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOV * T0.Z, T2.X,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: MOV * T2.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; CM-NEXT: BFE_UINT T3.Z, T0.Y, literal.x, PV.W,
+; CM-NEXT: NOT_INT * T3.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, T1.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, 1,
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T3.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T3.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: BFE_UINT T1.Z, T0.Y, literal.x, T2.W,
+; CM-NEXT: NOT_INT * T3.W, PV.W,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T1.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: ALU 104, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T1.Z, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT * T0.W, T2.Z, T0.W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T1.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T2.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, T0.Z, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.x, T2.W,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 5(7.006492e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.x, T2.W,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, T0.Z, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[5].Y, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.z,
+; CM-NEXT: -256(nan), 255(3.573311e-43)
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <8 x i32> %in to <8 x i8>
store <8 x i8> %0, ptr addrspace(5) %out, align 1
@@ -446,6 +1591,109 @@ entry:
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i8_halfaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 46, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.Z, literal.x, PV.W,
+; EG-NEXT: LSHL T2.W, T2.W, literal.y,
+; EG-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: OR_INT T3.W, PV.W, PS,
+; EG-NEXT: NOT_INT * T4.W, PV.Z,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PS,
+; EG-NEXT: LSHL T1.W, PV.W, T1.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT T0.Z, T3.W, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[3].W, literal.y,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.z,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T1.Z, PS, T2.W,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
+; EG-NEXT: LSHL T2.W, literal.x, PS,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i8_halfaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 46, @15, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].Z, literal.x,
+; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Y, literal.x, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, literal.y,
+; CM-NEXT: AND_INT * T2.W, KC0[3].Y, literal.z,
+; CM-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: OR_INT T1.Z, PV.Z, PV.W,
+; CM-NEXT: NOT_INT * T2.W, PV.Y,
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT T1.Y, T1.W, literal.x,
+; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.z,
+; CM-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T2.Y, PV.W, T0.Z,
+; CM-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 16(2.242078e-44), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, ptr addrspace(5) %out, align 2
@@ -463,6 +1711,25 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
+; EG-LABEL: store_f32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_f32:
+; CM: ; %bb.0:
+; CM-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[2].Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
store float %in, ptr addrspace(5) %out
ret void
}
@@ -483,6 +1750,83 @@ define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 33, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[3].Z, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T2.X, PV.W,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T1.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T2.X, PS,
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 33, @17, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV * T3.X, PV.W,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: AND_INT * T0.W, KC0[3].W, literal.y,
+; CM-NEXT: -65536(nan), 65535(9.183409e-41)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T2.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[3].Z, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV * T2.X, PV.W,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: AND_INT T0.Y, PV.Y, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[3].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: -65536(nan), 65535(9.183409e-41)
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
store <4 x i16> %0, ptr addrspace(5) %out
@@ -507,6 +1851,37 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v2f32(ptr addrspace(5) %out, float %a, float %b) {
+; EG-LABEL: store_v2f32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 10, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Z, KC0[2].Z,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2f32:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 10, @18, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Y, KC0[2].Z,
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[2].W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: RETURN
entry:
%0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
%1 = insertelement <2 x float> %0, float %b, i32 1
@@ -536,6 +1911,49 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) nounwind {
+; EG-LABEL: store_v3i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 16, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, KC0[3].Y,
+; EG-NEXT: LSHR T0.Y, PS, literal.x,
+; EG-NEXT: MOV T1.Z, KC0[3].Z,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[3].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.X,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v3i32:
+; CM: ; %bb.0:
+; CM-NEXT: ALU 16, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Y, KC0[3].Y,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T1.Y, KC0[3].Z,
+; CM-NEXT: LSHR T0.Z, PV.Z, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[3].W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: RETURN
store <3 x i32> %a, ptr addrspace(5) %out, align 16
ret void
}
@@ -566,6 +1984,61 @@ define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) noun
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 22, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Y, KC0[3].Y,
+; EG-NEXT: LSHR T0.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: MOV T1.X, KC0[4].X,
+; EG-NEXT: LSHR T1.Y, PS, literal.x,
+; EG-NEXT: MOV T1.Z, KC0[3].W,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[3].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i32:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 22, @20, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Z, KC0[3].Y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; CM-NEXT: LSHR T0.X, PV.W, literal.x,
+; CM-NEXT: MOV T1.Y, KC0[4].X,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T2.Y, KC0[3].W,
+; CM-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[3].Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
store <4 x i32> %in, ptr addrspace(5) %out
ret void
@@ -597,6 +2070,61 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i32_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Y, KC0[3].Y,
+; EG-NEXT: LSHR T0.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: MOV T1.X, KC0[4].X,
+; EG-NEXT: LSHR T1.Y, PS, literal.x,
+; EG-NEXT: MOV T1.Z, KC0[3].W,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[3].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i32_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Z, KC0[3].Y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; CM-NEXT: LSHR T0.X, PV.W, literal.x,
+; CM-NEXT: MOV T1.Y, KC0[4].X,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T2.Y, KC0[3].W,
+; CM-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[3].Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
store <4 x i32> %in, ptr addrspace(5) %out, align 4
ret void
@@ -629,6 +2157,85 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %in) {
+; EG-LABEL: store_v4f32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 34, @22, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR T0.X, PS, literal.x,
+; EG-NEXT: ADD_INT T0.Y, KC0[2].Z, literal.y,
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Z, literal.z,
+; EG-NEXT: ADD_INT T1.W, KC0[2].Z, literal.w,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
+; EG-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; EG-NEXT: 8(1.121039e-44), 4(5.605194e-45)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: LSHR T1.Y, PV.W, literal.x,
+; EG-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; EG-NEXT: LSHR T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHR * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; EG-NEXT: MOV * T1.W, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; EG-NEXT: MOV * T1.Z, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; EG-NEXT: MOV * T1.Y, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4f32:
+; CM: ; %bb.0:
+; CM-NEXT: ALU 34, @22, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 8(1.121039e-44), 12(1.681558e-44)
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T0.Y, PV.W, literal.x,
+; CM-NEXT: LSHR T0.Z, PV.Z, literal.x,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Z, literal.z,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.w,
+; CM-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; CM-NEXT: 8(1.121039e-44), 4(5.605194e-45)
+; CM-NEXT: LSHR T2.X, PV.W, literal.x,
+; CM-NEXT: LSHR T2.Y, PV.Z, literal.x,
+; CM-NEXT: LSHR T1.Z, PV.Y, literal.x,
+; CM-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.W, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T1.Y, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Y,
+; CM-NEXT: MOV * T1.Z, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.X,
+; CM-NEXT: MOV * T1.W, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
%1 = load <4 x float>, ptr addrspace(5) %in
store <4 x float> %1, ptr addrspace(5) %out
ret void
@@ -647,6 +2254,50 @@ define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %
; SI: buffer_store_byte
define amdgpu_kernel void @store_i64_i8(ptr addrspace(5) %out, i64 %in) {
+; EG-LABEL: store_i64_i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 16, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i64_i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 17, @23, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc i64 %in to i8
store i8 %0, ptr addrspace(5) %out
@@ -666,6 +2317,50 @@ entry:
; SI: buffer_store_short
define amdgpu_kernel void @store_i64_i16(ptr addrspace(5) %out, i64 %in) {
+; EG-LABEL: store_i64_i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 16, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i64_i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 17, @24, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc i64 %in to i16
store i16 %0, ptr addrspace(5) %out
@@ -692,6 +2387,41 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @vecload2(ptr addrspace(5) nocapture %out, ptr addrspace(4) nocapture %mem) #0 {
+; EG-LABEL: vecload2:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU 8, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: vecload2:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
+; CM-NEXT: TEX 0 @0
+; CM-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; CM-NEXT: ALU 8, @26, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.X,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: RETURN
entry:
%0 = load i32, ptr addrspace(4) %mem, align 4
%arrayidx1.i = getelementptr inbounds i32, ptr addrspace(4) %mem, i64 1
@@ -743,3 +2473,6 @@ entry:
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; SI: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/Windows/alloca.ll b/llvm/test/CodeGen/ARM/Windows/alloca.ll
index e014d287db6e90..d28836d1c70631 100644
--- a/llvm/test/CodeGen/ARM/Windows/alloca.ll
+++ b/llvm/test/CodeGen/ARM/Windows/alloca.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -O0 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
; RUN: llc -O0 -mtriple thumbv7-windows-msvc -filetype asm -o - %s | FileCheck %s
; RUN: llc -O0 -mtriple thumbv7-windows-mingw32 -filetype asm -o - %s | FileCheck %s
@@ -25,3 +26,5 @@ entry:
; CHECK: bl __chkstk
; CHECK: sub.w sp, sp, r4
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/Windows/vla.ll b/llvm/test/CodeGen/ARM/Windows/vla.ll
index 459db0c290b5a0..3adca905850c2d 100644
--- a/llvm/test/CodeGen/ARM/Windows/vla.ll
+++ b/llvm/test/CodeGen/ARM/Windows/vla.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s \
; RUN: | FileCheck %s -check-prefix CHECK-SMALL-CODE
; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -code-model=large -o - %s \
@@ -26,3 +27,6 @@ entry:
; CHECK-LARGE-CODE: movt [[IP]], :upper16:__chkstk
; CHECK-LARGE-CODE: blx [[IP]]
; CHECK-LARGE-CODE: sub.w sp, sp, r4
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-LARGE-CODE: {{.*}}
+; CHECK-SMALL-CODE: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/and-cmpz.ll b/llvm/test/CodeGen/ARM/and-cmpz.ll
index 1f72307f12a682..e1c9fe52911b99 100644
--- a/llvm/test/CodeGen/ARM/and-cmpz.ll
+++ b/llvm/test/CodeGen/ARM/and-cmpz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=thumbv7m-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=T2
; RUN: llc -mtriple=thumbv6m-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=T1
@@ -7,6 +8,24 @@
; T2-NEXT: it
; T1-NEXT: bmi
define i32 @single_bit(i32 %p) {
+; T2-LABEL: single_bit:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: lsls r0, r0, #23
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it pl
+; T2-NEXT: movpl r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: single_bit:
+; T1: @ %bb.0:
+; T1-NEXT: lsls r0, r0, #23
+; T1-NEXT: bmi .LBB0_2
+; T1-NEXT: @ %bb.1: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB0_2: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
%a = and i32 %p, 256
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -24,6 +43,26 @@ false:
; T2-NEXT: it
; T1-NEXT: bmi
define i32 @single_bit_multi_use(i32 %p, ptr %z) {
+; T2-LABEL: single_bit_multi_use:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: str r0, [r1]
+; T2-NEXT: lsls r0, r0, #23
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it pl
+; T2-NEXT: movpl r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: single_bit_multi_use:
+; T1: @ %bb.0:
+; T1-NEXT: str r0, [r1]
+; T1-NEXT: lsls r0, r0, #23
+; T1-NEXT: bmi .LBB1_2
+; T1-NEXT: @ %bb.1: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB1_2: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
store i32 %p, ptr %z
%a = and i32 %p, 256
%b = icmp eq i32 %a, 0
@@ -42,6 +81,24 @@ false:
; T2-NEXT: it
; T1-NEXT: beq
define i32 @multi_bit_lsb_ubfx(i32 %p) {
+; T2-LABEL: multi_bit_lsb_ubfx:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: lsls r0, r0, #24
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it eq
+; T2-NEXT: moveq r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: multi_bit_lsb_ubfx:
+; T1: @ %bb.0:
+; T1-NEXT: lsls r0, r0, #24
+; T1-NEXT: beq .LBB2_2
+; T1-NEXT: @ %bb.1: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB2_2: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
%a = and i32 %p, 255
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -59,6 +116,24 @@ false:
; T2-NEXT: it
; T1-NEXT: beq
define i32 @multi_bit_msb(i32 %p) {
+; T2-LABEL: multi_bit_msb:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: lsrs r0, r0, #24
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it eq
+; T2-NEXT: moveq r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: multi_bit_msb:
+; T1: @ %bb.0:
+; T1-NEXT: lsrs r0, r0, #24
+; T1-NEXT: beq .LBB3_2
+; T1-NEXT: @ %bb.1: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB3_2: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
%a = and i32 %p, 4278190080 ; 0xff000000
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -77,6 +152,26 @@ false:
; T2-NEXT: it
; T1-NEXT: beq
define i32 @multi_bit_nosb(i32 %p) {
+; T2-LABEL: multi_bit_nosb:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: movs r1, #2
+; T2-NEXT: tst.w r0, #16711680
+; T2-NEXT: it eq
+; T2-NEXT: moveq r1, #1
+; T2-NEXT: mov r0, r1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: multi_bit_nosb:
+; T1: @ %bb.0:
+; T1-NEXT: lsls r0, r0, #8
+; T1-NEXT: lsrs r0, r0, #24
+; T1-NEXT: beq .LBB4_2
+; T1-NEXT: @ %bb.1: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB4_2: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
%a = and i32 %p, 16711680 ; 0x00ff0000
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -96,6 +191,28 @@ false:
; T2-NEXT: movs r2, #0
; T2-NEXT: cmp.w r2, r0, lsr #9
define void @i16_cmpz(i16 %x, ptr %foo) {
+; T2-LABEL: i16_cmpz:
+; T2: @ %bb.0: @ %entry
+; T2-NEXT: uxth r0, r0
+; T2-NEXT: movs r2, #0
+; T2-NEXT: cmp.w r2, r0, lsr #9
+; T2-NEXT: it ne
+; T2-NEXT: bxne lr
+; T2-NEXT: .LBB5_1: @ %if.then
+; T2-NEXT: movs r0, #0
+; T2-NEXT: bx r1
+;
+; T1-LABEL: i16_cmpz:
+; T1: @ %bb.0: @ %entry
+; T1-NEXT: push {r7, lr}
+; T1-NEXT: uxth r0, r0
+; T1-NEXT: lsrs r0, r0, #9
+; T1-NEXT: bne .LBB5_2
+; T1-NEXT: @ %bb.1: @ %if.then
+; T1-NEXT: movs r0, #0
+; T1-NEXT: blx r1
+; T1-NEXT: .LBB5_2: @ %if.end
+; T1-NEXT: pop {r7, pc}
entry:
%cmp = icmp ult i16 %x, 512
br i1 %cmp, label %if.then, label %if.end
@@ -107,3 +224,5 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/bfx.ll b/llvm/test/CodeGen/ARM/bfx.ll
index a585fc8be9ede1..fdde6be286b2bd 100644
--- a/llvm/test/CodeGen/ARM/bfx.ll
+++ b/llvm/test/CodeGen/ARM/bfx.ll
@@ -1,8 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s
define i32 @sbfx1(i32 %a) {
-; CHECK: sbfx1
-; CHECK: sbfx r0, r0, #7, #11
+; CHECK-LABEL: sbfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sbfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = sext i11 %t2 to i32
@@ -10,8 +13,10 @@ define i32 @sbfx1(i32 %a) {
}
define i32 @ubfx1(i32 %a) {
-; CHECK: ubfx1
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = zext i11 %t2 to i32
@@ -19,8 +24,10 @@ define i32 @ubfx1(i32 %a) {
}
define i32 @ubfx2(i32 %a) {
-; CHECK: ubfx2
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = and i32 %t1, 2047
ret i32 %t2
@@ -28,14 +35,18 @@ define i32 @ubfx2(i32 %a) {
; rdar://12870177
define i32 @ubfx_opt(ptr nocapture %ctx, i32 %x) nounwind readonly ssp {
+; CHECK-LABEL: ubfx_opt:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsr r2, r1, #24
+; CHECK-NEXT: ldr r2, [r0, r2, lsl #2]
+; CHECK-NEXT: ubfx r3, r1, #16, #8
+; CHECK-NEXT: ldr r3, [r0, r3, lsl #2]
+; CHECK-NEXT: ubfx r1, r1, #8, #8
+; CHECK-NEXT: ldr r0, [r0, r1, lsl #2]
+; CHECK-NEXT: add r2, r3, r2
+; CHECK-NEXT: add r0, r2, r0
+; CHECK-NEXT: bx lr
entry:
-; CHECK: ubfx_opt
-; CHECK: lsr [[REG1:(lr|r[0-9]+)]], r1, #24
-; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG1]], lsl #2]
-; CHECK: ubfx [[REG2:(lr|r[0-9]+)]], r1, #16, #8
-; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG2]], lsl #2]
-; CHECK: ubfx [[REG3:(lr|r[0-9]+)]], r1, #8, #8
-; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG3]], lsl #2]
%and = lshr i32 %x, 8
%shr = and i32 %and, 255
%and1 = lshr i32 %x, 16
@@ -53,16 +64,20 @@ entry:
}
define i32 @ubfx3(i32 %a) {
-; CHECK: ubfx3
-; CHECK: ubfx r0, r0, #11, #1
+; CHECK-LABEL: ubfx3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #11, #1
+; CHECK-NEXT: bx lr
%t1 = and i32 %a, 2048
%t2 = lshr i32 %t1, 11
ret i32 %t2
}
define i32 @ubfx4(i32 %a) {
-; CHECK: ubfx4
-; CHECK: ubfx r0, r0, #7, #3
+; CHECK-LABEL: ubfx4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #3
+; CHECK-NEXT: bx lr
%t1 = and i32 %a, 896
%t2 = lshr i32 %t1, 7
ret i32 %t2
diff --git a/llvm/test/CodeGen/ARM/sbfx.ll b/llvm/test/CodeGen/ARM/sbfx.ll
index 5b77c59bca967a..72e9b5b1c9c425 100644
--- a/llvm/test/CodeGen/ARM/sbfx.ll
+++ b/llvm/test/CodeGen/ARM/sbfx.ll
@@ -1,46 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: sbfx r0, r0, #0, #20
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sbfx r0, r0, #0, #20
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 12
%tmp2 = ashr i32 %tmp, 12
ret i32 %tmp2
}
define i32 @f2(i32 %a) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: bfc r0, #20, #12
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bfc r0, #20, #12
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 12
%tmp2 = lshr i32 %tmp, 12
ret i32 %tmp2
}
define i32 @f3(i32 %a) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: sbfx r0, r0, #5, #3
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sbfx r0, r0, #5, #3
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 24
%tmp2 = ashr i32 %tmp, 29
ret i32 %tmp2
}
define i32 @f4(i32 %a) {
-entry:
; CHECK-LABEL: f4:
-; CHECK: ubfx r0, r0, #5, #3
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ubfx r0, r0, #5, #3
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 24
%tmp2 = lshr i32 %tmp, 29
ret i32 %tmp2
}
define i32 @f5(i32 %a) {
-entry:
; CHECK-LABEL: f5:
-; CHECK-NOT: sbfx
-; CHECK: bx
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsl r0, r0, #3
+; CHECK-NEXT: asr r0, r0, #1
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 3
%tmp2 = ashr i32 %tmp, 1
ret i32 %tmp2
@@ -48,7 +59,9 @@ entry:
define signext i8 @f6(i32 %a) {
; CHECK-LABEL: f6:
-; CHECK: sbfx r0, r0, #23, #8
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sbfx r0, r0, #23, #8
+; CHECK-NEXT: bx lr
%tmp = lshr i32 %a, 23
%res = trunc i32 %tmp to i8
@@ -57,7 +70,9 @@ define signext i8 @f6(i32 %a) {
define signext i8 @f7(i32 %a) {
; CHECK-LABEL: f7:
-; CHECK-NOT: sbfx
+; CHECK: @ %bb.0:
+; CHECK-NEXT: lsr r0, r0, #25
+; CHECK-NEXT: bx lr
%tmp = lshr i32 %a, 25
%res = trunc i32 %tmp to i8
diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
index a9eda31e729e2c..58a5bf1bda71da 100644
--- a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
+++ b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
@@ -1,13 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV
; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV
; Check SREM
define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: test_rem
-; CHECK: asr r1, r0, #31
-; CHECK-NEXT: add r1, r0, r1, lsr #30
-; CHECK-NEXT: bic r1, r1, #3
-; CHECK-NEXT: sub r0, r0, r1
+; CHECK-LABEL: test_rem:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asr r1, r0, #31
+; CHECK-NEXT: add r1, r0, r1, lsr #30
+; CHECK-NEXT: bic r1, r1, #3
+; CHECK-NEXT: sub r0, r0, r1
+; CHECK-NEXT: bx lr
entry:
%div = srem i32 %F, 4
@@ -16,18 +19,22 @@ entry:
; Try an i16 sdiv, with a small immediate.
define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f0
+; DIV-LABEL: f0:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #2
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: sxth r0, r0
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f0:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: uxth r1, r0
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: sxth r0, r0
+; NODIV-NEXT: asr r0, r0, #1
+; NODIV-NEXT: bx lr
-; DIV: mov r1, #2
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: sxth r0, r0
-; DIV-NEXT: bx lr
-; NODIV: uxth r1, r0
-; NODIV-NEXT: add r0, r0, r1, lsr #15
-; NODIV-NEXT: sxth r0, r0
-; NODIV-NEXT: asr r0, r0, #1
-; NODIV-NEXT: bx lr
entry:
%0 = sdiv i16 %F, 2
@@ -36,16 +43,20 @@ entry:
; Try an i32 sdiv, with a small immediate.
define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f1
+; DIV-LABEL: f1:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #4
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f1:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #30
+; NODIV-NEXT: asr r0, r0, #2
+; NODIV-NEXT: bx lr
-; DIV: mov r1, #4
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: bx lr
-; NODIV: asr r1, r0, #31
-; NODIV-NEXT: add r0, r0, r1, lsr #30
-; NODIV-NEXT: asr r0, r0, #2
-; NODIV-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
@@ -55,10 +66,18 @@ entry:
; Try a large power of 2 immediate, which should also be materialised with 1
; move immediate instruction.
define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f2
-; DIV: mov r1, #131072
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: bx lr
+; DIV-LABEL: f2:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #131072
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f2:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: asr r0, r0, #17
+; NODIV-NEXT: bx lr
entry:
%div = sdiv i32 %F, 131072
ret i32 %div
@@ -66,11 +85,12 @@ entry:
; MinSize not set, so should expand to the faster but longer sequence.
define dso_local i32 @f3(i32 %F) {
-; CHECK-LABEL: f3
-; CHECK: asr r1, r0, #31
-; CHECK-NEXT: add r0, r0, r1, lsr #30
-; CHECK-NEXT: asr r0, r0, #2
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asr r1, r0, #31
+; CHECK-NEXT: add r0, r0, r1, lsr #30
+; CHECK-NEXT: asr r0, r0, #2
+; CHECK-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
ret i32 %div
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
index 66417cddd4d566..196d9340a7ce59 100644
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -1240,6 +1240,67 @@ define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i
; CHECK-BE-NEXT: vorr q8, q8, q10
; CHECK-BE-NEXT: vrev64.32 q0, q8
; CHECK-BE-NEXT: bx lr
+;
+; CHECK-ALIGN-LABEL: or_tree_with_shifts_vec_i32:
+; CHECK-ALIGN: @ %bb.0:
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #16]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r0
+; CHECK-ALIGN-NEXT: ldr r0, [sp]
+; CHECK-ALIGN-NEXT: orr.w r12, r0, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r0, [sp, #32]
+; CHECK-ALIGN-NEXT: orr.w r0, r0, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #20]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r1
+; CHECK-ALIGN-NEXT: ldr r1, [sp, #4]
+; CHECK-ALIGN-NEXT: orr.w r12, r1, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r1, [sp, #36]
+; CHECK-ALIGN-NEXT: orr.w r1, r1, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #24]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r2
+; CHECK-ALIGN-NEXT: ldr r2, [sp, #8]
+; CHECK-ALIGN-NEXT: orr.w r12, r2, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r2, [sp, #40]
+; CHECK-ALIGN-NEXT: orr.w r2, r2, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #28]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r3
+; CHECK-ALIGN-NEXT: ldr r3, [sp, #12]
+; CHECK-ALIGN-NEXT: orr.w r12, r3, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r3, [sp, #44]
+; CHECK-ALIGN-NEXT: orr.w r3, r3, r12
+; CHECK-ALIGN-NEXT: bx lr
+;
+; CHECK-V6M-LABEL: or_tree_with_shifts_vec_i32:
+; CHECK-V6M: @ %bb.0:
+; CHECK-V6M-NEXT: push {r4, lr}
+; CHECK-V6M-NEXT: ldr r4, [sp, #24]
+; CHECK-V6M-NEXT: orrs r4, r0
+; CHECK-V6M-NEXT: lsls r0, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #8]
+; CHECK-V6M-NEXT: orrs r4, r0
+; CHECK-V6M-NEXT: ldr r0, [sp, #40]
+; CHECK-V6M-NEXT: orrs r0, r4
+; CHECK-V6M-NEXT: ldr r4, [sp, #28]
+; CHECK-V6M-NEXT: orrs r4, r1
+; CHECK-V6M-NEXT: lsls r1, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #12]
+; CHECK-V6M-NEXT: orrs r4, r1
+; CHECK-V6M-NEXT: ldr r1, [sp, #44]
+; CHECK-V6M-NEXT: orrs r1, r4
+; CHECK-V6M-NEXT: ldr r4, [sp, #32]
+; CHECK-V6M-NEXT: orrs r4, r2
+; CHECK-V6M-NEXT: lsls r2, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #16]
+; CHECK-V6M-NEXT: orrs r4, r2
+; CHECK-V6M-NEXT: ldr r2, [sp, #48]
+; CHECK-V6M-NEXT: orrs r2, r4
+; CHECK-V6M-NEXT: ldr r4, [sp, #36]
+; CHECK-V6M-NEXT: orrs r4, r3
+; CHECK-V6M-NEXT: lsls r3, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #20]
+; CHECK-V6M-NEXT: orrs r4, r3
+; CHECK-V6M-NEXT: ldr r3, [sp, #52]
+; CHECK-V6M-NEXT: orrs r3, r4
+; CHECK-V6M-NEXT: pop {r4, pc}
%a.shifted = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
%c.shifted = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
%or.ab = or <4 x i32> %a.shifted, %b
@@ -1271,6 +1332,72 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
; CHECK-BE-NEXT: vorr q8, q9, q8
; CHECK-BE-NEXT: vrev64.32 q0, q8
; CHECK-BE-NEXT: bx lr
+;
+; CHECK-ALIGN-LABEL: or_tree_with_mismatching_shifts_vec_i32:
+; CHECK-ALIGN: @ %bb.0:
+; CHECK-ALIGN-NEXT: push {r7, lr}
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #24]
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #40]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #8]
+; CHECK-ALIGN-NEXT: orr.w r0, lr, r0, lsl #16
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #44]
+; CHECK-ALIGN-NEXT: orr.w r0, r0, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #28]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #12]
+; CHECK-ALIGN-NEXT: orr.w r1, lr, r1, lsl #16
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #48]
+; CHECK-ALIGN-NEXT: orr.w r1, r1, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #32]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #16]
+; CHECK-ALIGN-NEXT: orr.w r2, lr, r2, lsl #16
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #52]
+; CHECK-ALIGN-NEXT: orr.w r2, r2, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #36]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #20]
+; CHECK-ALIGN-NEXT: orr.w r3, lr, r3, lsl #16
+; CHECK-ALIGN-NEXT: orr.w r3, r3, r12
+; CHECK-ALIGN-NEXT: pop {r7, pc}
+;
+; CHECK-V6M-LABEL: or_tree_with_mismatching_shifts_vec_i32:
+; CHECK-V6M: @ %bb.0:
+; CHECK-V6M-NEXT: push {r4, r5, r7, lr}
+; CHECK-V6M-NEXT: ldr r4, [sp, #32]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #48]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r0, #16
+; CHECK-V6M-NEXT: ldr r0, [sp, #16]
+; CHECK-V6M-NEXT: orrs r0, r4
+; CHECK-V6M-NEXT: orrs r0, r5
+; CHECK-V6M-NEXT: ldr r4, [sp, #36]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #52]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r1, #16
+; CHECK-V6M-NEXT: ldr r1, [sp, #20]
+; CHECK-V6M-NEXT: orrs r1, r4
+; CHECK-V6M-NEXT: orrs r1, r5
+; CHECK-V6M-NEXT: ldr r4, [sp, #40]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #56]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r2, #16
+; CHECK-V6M-NEXT: ldr r2, [sp, #24]
+; CHECK-V6M-NEXT: orrs r2, r4
+; CHECK-V6M-NEXT: orrs r2, r5
+; CHECK-V6M-NEXT: ldr r4, [sp, #44]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #60]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r3, #16
+; CHECK-V6M-NEXT: ldr r3, [sp, #28]
+; CHECK-V6M-NEXT: orrs r3, r4
+; CHECK-V6M-NEXT: orrs r3, r5
+; CHECK-V6M-NEXT: pop {r4, r5, r7, pc}
%a.shifted = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
%c.shifted = shl <4 x i32> %c, <i32 17, i32 17, i32 17, i32 17>
%or.ab = or <4 x i32> %a.shifted, %b
diff --git a/llvm/test/CodeGen/BPF/remove_truncate_9.ll b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
index 3b9293d38fd01f..7656943ad3049a 100644
--- a/llvm/test/CodeGen/BPF/remove_truncate_9.ll
+++ b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mcpu=v2 -march=bpf < %s | FileCheck %s
; RUN: llc -mcpu=v4 -march=bpf < %s | FileCheck %s
@@ -79,3 +80,5 @@ declare void @sink1(i8, i64, i64, i64, i1);
declare void @sink2(i16, i64, i64, i64, i1);
declare void @sink3(i8, i1);
declare void @sink4(i32, i1);
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/Mips/cins.ll b/llvm/test/CodeGen/Mips/cins.ll
index 4fe25564d1c12d..d00138a3ce37a7 100644
--- a/llvm/test/CodeGen/Mips/cins.ll
+++ b/llvm/test/CodeGen/Mips/cins.ll
@@ -1,92 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -march=mips64 -mcpu=octeon -target-abi=n64 < %s -o - | FileCheck %s
define i64 @cins_zext(i32 signext %n) {
+; CHECK-LABEL: cins_zext:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 5, 26
entry:
%shl = shl i32 %n, 5
%conv = zext i32 %shl to i64
ret i64 %conv
-; CHECK-LABEL: cins_zext:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 5, 26
}
define i64 @cins_and_shl(i64 zeroext %n) {
+; CHECK-LABEL: cins_and_shl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 8, 15
entry:
%and = shl i64 %n, 8
%shl = and i64 %and, 16776960
ret i64 %shl
-; CHECK-LABEL: cins_and_shl:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 8, 15
}
define i64 @cins_and_shl32(i64 zeroext %n) {
+; CHECK-LABEL: cins_and_shl32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins32 $2, $4, 6, 15
entry:
%and = shl i64 %n, 38
%shl = and i64 %and, 18014123631575040
ret i64 %shl
-; CHECK-LABEL: cins_and_shl32:
-; CHECK: cins32 $[[R0:[0-9]+]], $[[R1:[0-9]+]], 6, 15
}
define zeroext i16 @cins_and_shl_16(i16 zeroext %n) {
+; CHECK-LABEL: cins_and_shl_16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 2, 3
entry:
%0 = shl i16 %n, 2
%1 = and i16 %0, 60
ret i16 %1
-; CHECK-LABEL: cins_and_shl_16:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 3
}
define zeroext i8 @cins_and_shl_8(i8 zeroext %n) {
+; CHECK-LABEL: cins_and_shl_8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 2, 1
entry:
%0 = shl i8 %n, 2
%1 = and i8 %0, 12
ret i8 %1
-; CHECK-LABEL: cins_and_shl_8:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 1
}
define i32 @cins_i32(i32 signext %a) {
+; CHECK-LABEL: cins_i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 17, 11
entry:
%and = shl i32 %a, 17
%shl = and i32 %and, 536739840
ret i32 %shl
-; CHECK-LABEL: cins_i32:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 17, 11
}
define i64 @cins_shl_and(i32 signext %n) {
+; CHECK-LABEL: cins_shl_and:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 31, 15
entry:
%and = and i32 %n, 65535
%conv = zext i32 %and to i64
%shl = shl nuw nsw i64 %conv, 31
ret i64 %shl
-; CHECK-LABEL: cins_shl_and:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 31, 15
}
define i64 @cins_shl_and32(i32 signext %n) {
+; CHECK-LABEL: cins_shl_and32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins32 $2, $4, 15, 15
entry:
%and = and i32 %n, 65535
%conv = zext i32 %and to i64
%shl = shl nuw nsw i64 %conv, 47
ret i64 %shl
-; CHECK-LABEL: cins_shl_and32:
-; CHECK: cins32 $[[R0:[0-9]+]], $[[R1:[0-9]+]], 15, 15
}
diff --git a/llvm/test/CodeGen/Mips/fabs.ll b/llvm/test/CodeGen/Mips/fabs.ll
index 75aa7d8295f94e..e596d93509feeb 100644
--- a/llvm/test/CodeGen/Mips/fabs.ll
+++ b/llvm/test/CodeGen/Mips/fabs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Check that abs.[ds] is only selected for mips32r6 or mips64r6 when no
; additional options are passed. For revisions prior mips32r6 and mips64r6,
; abs.[ds] does not generate the correct result when working with NaNs, and
@@ -73,12 +74,13 @@
; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=CHECK-ABS2008
define float @foo0(float %a) nounwind readnone {
+; CHECK-ABS2008-LABEL: foo0:
+; CHECK-ABS2008: # %bb.0: # %entry
+; CHECK-ABS2008-NEXT: jr $ra
+; CHECK-ABS2008-NEXT: abs.s $f0, $f12
entry:
; CHECK-LABEL: foo0
-; CHECK-ABS2008: abs.s
-; CHECK-ABSLEGACY: {{(ori|ins)}}
-; CHECK-ABSLEGACY-NOT: abs.s
%call = tail call float @fabsf(float %a) nounwind readnone
ret float %call
@@ -87,15 +89,18 @@ entry:
declare float @fabsf(float) nounwind readnone
define double @foo1(double %a) nounwind readnone {
+; CHECK-ABS2008-LABEL: foo1:
+; CHECK-ABS2008: # %bb.0: # %entry
+; CHECK-ABS2008-NEXT: jr $ra
+; CHECK-ABS2008-NEXT: abs.d $f0, $f12
entry:
; CHECK-LABEL: foo1:
-; CHECK-ABS2008: abs.d
-; CHECK-ABSLEGACY: {{(ori|ins|dsll)}}
-; CHECK-ABSLEGACY-NOT: abs.d
%call = tail call double @fabs(double %a) nounwind readnone
ret double %call
}
declare double @fabs(double) nounwind readnone
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-ABSLEGACY: {{.*}}
diff --git a/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll b/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
index 695431a5ab6074..cc2c674f89586b 100644
--- a/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
+++ b/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -verify-machineinstrs -march=mips64el -mcpu=mips4 \
; RUN: -target-abi=n64 | FileCheck %s -check-prefixes=ALL,64
; RUN: llc < %s -verify-machineinstrs -march=mips64el -mcpu=mips64 \
@@ -10,21 +11,46 @@ declare double @copysign(double, double) nounwind readnone
declare float @copysignf(float, float) nounwind readnone
define float @func2(float %d, double %f) nounwind readnone {
+; 64-LABEL: func2:
+; 64: # %bb.0: # %entry
+; 64-NEXT: lui $1, %highest(.LCPI0_0)
+; 64-NEXT: daddiu $1, $1, %higher(.LCPI0_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: daddiu $1, $1, %hi(.LCPI0_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: lwc1 $f0, %lo(.LCPI0_0)($1)
+; 64-NEXT: add.s $f0, $f12, $f0
+; 64-NEXT: mfc1 $1, $f0
+; 64-NEXT: dmfc1 $2, $f13
+; 64-NEXT: lui $3, 32767
+; 64-NEXT: ori $3, $3, 65535
+; 64-NEXT: and $1, $1, $3
+; 64-NEXT: dsrl $2, $2, 63
+; 64-NEXT: sll $2, $2, 0
+; 64-NEXT: sll $2, $2, 31
+; 64-NEXT: or $1, $1, $2
+; 64-NEXT: jr $ra
+; 64-NEXT: mtc1 $1, $f0
+;
+; 64R2-LABEL: func2:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: lui $1, %highest(.LCPI0_0)
+; 64R2-NEXT: daddiu $1, $1, %higher(.LCPI0_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: daddiu $1, $1, %hi(.LCPI0_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: lwc1 $f0, %lo(.LCPI0_0)($1)
+; 64R2-NEXT: add.s $f0, $f12, $f0
+; 64R2-NEXT: mfc1 $1, $f0
+; 64R2-NEXT: dmfc1 $2, $f13
+; 64R2-NEXT: dextu $2, $2, 63, 1
+; 64R2-NEXT: sll $2, $2, 0
+; 64R2-NEXT: ins $1, $2, 31, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: mtc1 $1, $f0
entry:
-; ALL-LABEL: func2:
-; 64-DAG: lui $[[T0:[0-9]+]], 32767
-; 64-DAG: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 64-DAG: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 64-DAG: dsrl $[[DSRL:[0-9]+]], ${{[0-9]+}}, 63
-; 64-DAG: sll $[[SLL0:[0-9]+]], $[[DSRL]], 0
-; 64-DAG: sll $[[SLL1:[0-9]+]], $[[SLL0]], 31
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[SLL1]]
-; 64: mtc1 $[[OR]], $f0
-; 64R2: dextu ${{[0-9]+}}, ${{[0-9]+}}, 63, 1
-; 64R2: ins $[[INS:[0-9]+]], ${{[0-9]+}}, 31, 1
-; 64R2: mtc1 $[[INS]], $f0
%add = fadd float %d, 1.000000e+00
%conv = fptrunc double %f to float
@@ -33,26 +59,51 @@ entry:
}
define double @func3(double %d, float %f) nounwind readnone {
+; 64-LABEL: func3:
+; 64: # %bb.0: # %entry
+; 64-NEXT: lui $1, %highest(.LCPI1_0)
+; 64-NEXT: daddiu $1, $1, %higher(.LCPI1_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: daddiu $1, $1, %hi(.LCPI1_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: ldc1 $f0, %lo(.LCPI1_0)($1)
+; 64-NEXT: add.d $f0, $f12, $f0
+; 64-NEXT: mfc1 $1, $f13
+; 64-NEXT: daddiu $2, $zero, 1
+; 64-NEXT: dmfc1 $3, $f0
+; 64-NEXT: dsll $2, $2, 63
+; 64-NEXT: daddiu $2, $2, -1
+; 64-NEXT: and $2, $3, $2
+; 64-NEXT: srl $1, $1, 31
+; 64-NEXT: dsll $1, $1, 63
+; 64-NEXT: or $1, $2, $1
+; 64-NEXT: jr $ra
+; 64-NEXT: dmtc1 $1, $f0
+;
+; 64R2-LABEL: func3:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: lui $1, %highest(.LCPI1_0)
+; 64R2-NEXT: daddiu $1, $1, %higher(.LCPI1_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: daddiu $1, $1, %hi(.LCPI1_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: ldc1 $f0, %lo(.LCPI1_0)($1)
+; 64R2-NEXT: add.d $f0, $f12, $f0
+; 64R2-NEXT: dmfc1 $1, $f0
+; 64R2-NEXT: mfc1 $2, $f13
+; 64R2-NEXT: ext $2, $2, 31, 1
+; 64R2-NEXT: dext $2, $2, 0, 32
+; 64R2-NEXT: dinsu $1, $2, 63, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: dmtc1 $1, $f0
entry:
-; ALL-LABEL: func3:
-
-; 64: mfc1 $[[MFC:[0-9]+]], $f13
-; 64: daddiu $[[R1:[0-9]+]], $zero, 1
-; 64: dmfc1 $[[R0:[0-9]+]], ${{.*}}
-; 64: dsll $[[R2:[0-9]+]], $[[R1]], 63
-; 64: daddiu $[[R3:[0-9]+]], $[[R2]], -1
-; 64: and $[[AND0:[0-9]+]], $[[R0]], $[[R3]]
-; 64: srl $[[SRL:[0-9]+]], $[[MFC:[0-9]+]], 31
-; 64: dsll $[[DSLL:[0-9]+]], $[[SRL]], 63
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[DSLL]]
-; 64: dmtc1 $[[OR]], $f0
-
-; 64R2: ext ${{[0-9]+}}, ${{[0-9]+}}, 31, 1
-; 64R2: dinsu $[[INS:[0-9]+]], ${{[0-9]+}}, 63, 1
-; 64R2: dmtc1 $[[INS]], $f0
+
+
%add = fadd double %d, 1.000000e+00
%conv = fpext float %f to double
%call = tail call double @copysign(double %add, double %conv) nounwind readnone
ret double %call
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
diff --git a/llvm/test/CodeGen/Mips/fcopysign.ll b/llvm/test/CodeGen/Mips/fcopysign.ll
index 810d0f9580861c..167354aaf085a4 100644
--- a/llvm/test/CodeGen/Mips/fcopysign.ll
+++ b/llvm/test/CodeGen/Mips/fcopysign.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -verify-machineinstrs -march=mipsel -mcpu=mips32 \
; RUN: | FileCheck %s -check-prefix=32
; RUN: llc < %s -verify-machineinstrs -march=mipsel -mcpu=mips32r2 \
@@ -10,31 +11,58 @@
; RUN: | FileCheck %s -check-prefix=64R2
define double @func0(double %d0, double %d1) nounwind readnone {
-entry:
+; 32-LABEL: func0:
+; 32: # %bb.0: # %entry
+; 32-NEXT: mfc1 $1, $f15
+; 32-NEXT: lui $2, 32768
+; 32-NEXT: and $1, $1, $2
+; 32-NEXT: lui $2, 32767
+; 32-NEXT: ori $2, $2, 65535
+; 32-NEXT: mfc1 $3, $f13
+; 32-NEXT: and $2, $3, $2
+; 32-NEXT: or $1, $2, $1
+; 32-NEXT: mfc1 $2, $f12
+; 32-NEXT: mtc1 $2, $f0
+; 32-NEXT: jr $ra
+; 32-NEXT: mtc1 $1, $f1
+;
+; 32R2-LABEL: func0:
+; 32R2: # %bb.0: # %entry
+; 32R2-NEXT: mfhc1 $1, $f12
+; 32R2-NEXT: mfhc1 $2, $f14
+; 32R2-NEXT: ext $2, $2, 31, 1
+; 32R2-NEXT: ins $1, $2, 31, 1
+; 32R2-NEXT: mfc1 $2, $f12
+; 32R2-NEXT: mtc1 $2, $f0
+; 32R2-NEXT: mthc1 $1, $f0
+; 32R2-NEXT: jr $ra
+; 32R2-NEXT: nop
;
-; 32: lui $[[MSK1:[0-9]+]], 32768
-; 32: and $[[AND1:[0-9]+]], ${{[0-9]+}}, $[[MSK1]]
-; 32: lui $[[T0:[0-9]+]], 32767
-; 32: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 32: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 32: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
-; 32: mtc1 $[[OR]], $f1
+; 64-LABEL: func0:
+; 64: # %bb.0: # %entry
+; 64-NEXT: daddiu $1, $zero, 1
+; 64-NEXT: dsll $1, $1, 63
+; 64-NEXT: dmfc1 $2, $f13
+; 64-NEXT: and $2, $2, $1
+; 64-NEXT: dmfc1 $3, $f12
+; 64-NEXT: daddiu $1, $1, -1
+; 64-NEXT: and $1, $3, $1
+; 64-NEXT: or $1, $1, $2
+; 64-NEXT: jr $ra
+; 64-NEXT: dmtc1 $1, $f0
+;
+; 64R2-LABEL: func0:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: dmfc1 $1, $f12
+; 64R2-NEXT: dmfc1 $2, $f13
+; 64R2-NEXT: dextu $2, $2, 63, 1
+; 64R2-NEXT: dinsu $1, $2, 63, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: dmtc1 $1, $f0
+entry:
-; 32R2: ext $[[EXT:[0-9]+]], ${{[0-9]+}}, 31, 1
-; 32R2: ins $[[INS:[0-9]+]], $[[EXT]], 31, 1
-; 32R2: mthc1 $[[INS]], $f0
-; 64: daddiu $[[T0:[0-9]+]], $zero, 1
-; 64: dsll $[[MSK1:[0-9]+]], $[[T0]], 63
-; 64: and $[[AND1:[0-9]+]], ${{[0-9]+}}, $[[MSK1]]
-; 64: daddiu $[[MSK0:[0-9]+]], $[[MSK1]], -1
-; 64: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
-; 64: dmtc1 $[[OR]], $f0
-; 64R2: dextu $[[EXT:[0-9]+]], ${{[0-9]+}}, 63, 1
-; 64R2: dinsu $[[INS:[0-9]+]], $[[EXT]], 63, 1
-; 64R2: dmtc1 $[[INS]], $f0
%call = tail call double @copysign(double %d0, double %d1) nounwind readnone
ret double %call
@@ -43,19 +71,52 @@ entry:
declare double @copysign(double, double) nounwind readnone
define float @func1(float %f0, float %f1) nounwind readnone {
+; 32-LABEL: func1:
+; 32: # %bb.0: # %entry
+; 32-NEXT: mfc1 $1, $f14
+; 32-NEXT: lui $2, 32768
+; 32-NEXT: and $1, $1, $2
+; 32-NEXT: lui $2, 32767
+; 32-NEXT: ori $2, $2, 65535
+; 32-NEXT: mfc1 $3, $f12
+; 32-NEXT: and $2, $3, $2
+; 32-NEXT: or $1, $2, $1
+; 32-NEXT: jr $ra
+; 32-NEXT: mtc1 $1, $f0
+;
+; 32R2-LABEL: func1:
+; 32R2: # %bb.0: # %entry
+; 32R2-NEXT: mfc1 $1, $f12
+; 32R2-NEXT: mfc1 $2, $f14
+; 32R2-NEXT: ext $2, $2, 31, 1
+; 32R2-NEXT: ins $1, $2, 31, 1
+; 32R2-NEXT: jr $ra
+; 32R2-NEXT: mtc1 $1, $f0
+;
+; 64-LABEL: func1:
+; 64: # %bb.0: # %entry
+; 64-NEXT: mfc1 $1, $f13
+; 64-NEXT: lui $2, 32768
+; 64-NEXT: and $1, $1, $2
+; 64-NEXT: lui $2, 32767
+; 64-NEXT: ori $2, $2, 65535
+; 64-NEXT: mfc1 $3, $f12
+; 64-NEXT: and $2, $3, $2
+; 64-NEXT: or $1, $2, $1
+; 64-NEXT: jr $ra
+; 64-NEXT: mtc1 $1, $f0
+;
+; 64R2-LABEL: func1:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: mfc1 $1, $f12
+; 64R2-NEXT: mfc1 $2, $f13
+; 64R2-NEXT: ext $2, $2, 31, 1
+; 64R2-NEXT: ins $1, $2, 31, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: mtc1 $1, $f0
entry:
-; 32: lui $[[MSK1:[0-9]+]], 32768
-; 32: and $[[AND1:[0-9]+]], ${{[0-9]+}}, $[[MSK1]]
-; 32: lui $[[T0:[0-9]+]], 32767
-; 32: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 32: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 32: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
-; 32: mtc1 $[[OR]], $f0
-; 32R2: ext $[[EXT:[0-9]+]], ${{[0-9]+}}, 31, 1
-; 32R2: ins $[[INS:[0-9]+]], $[[EXT]], 31, 1
-; 32R2: mtc1 $[[INS]], $f0
%call = tail call float @copysignf(float %f0, float %f1) nounwind readnone
ret float %call
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/abs.ll b/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
index c0812977e3a11b..ea0e34fb2b0856 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s --check-prefix=MIPS32
; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+abs2008,+fp64 -asm-show-inst < %s | FileCheck %s --check-prefix=MIPS32FP64
; RUN: llc -march=mips -mcpu=mips32r3 -mattr=+abs2008,+micromips -asm-show-inst < %s | FileCheck %s --check-prefix=MM
@@ -5,23 +6,123 @@
; RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips -asm-show-inst < %s | FileCheck %s --check-prefix=MMR6
define float @abs_s(float %a) {
-; MIPS32: {{(ori|ins)}}
-; MIPS32-NOT: abs.s
-; MIPS32FP64: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S
-; MM: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S_MM
-; MMFP64: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S_MM
-; MMR6: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S_MM
+; MIPS32-LABEL: abs_s:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: lui $1, 32767 # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32-NEXT: # <MCOperand Imm:32767>>
+; MIPS32-NEXT: ori $1, $1, 65535 # <MCInst #[[#MCINST2:]] ORi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:65535>>
+; MIPS32-NEXT: mfc1 $2, $f12 # <MCInst #[[#MCINST3:]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MIPS32-NEXT: and $1, $2, $1 # <MCInst #[[#MCINST4:]] AND
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST5:]] JR
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MIPS32-NEXT: mtc1 $1, $f0 # <MCInst #[[#MCINST6:]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+;
+; MIPS32FP64-LABEL: abs_s:
+; MIPS32FP64: # %bb.0:
+; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST5:]] JR
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MIPS32FP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST7:]] FABS_S
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+;
+; MM-LABEL: abs_s:
+; MM: # %bb.0:
+; MM-NEXT: jr $ra # <MCInst #[[#MCINST8:]] JR_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MM-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MM-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+;
+; MMFP64-LABEL: abs_s:
+; MMFP64: # %bb.0:
+; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST8:]] JR_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MMFP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+;
+; MMR6-LABEL: abs_s:
+; MMR6: # %bb.0:
+; MMR6-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST10:]] JRC16_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
%ret = call float @llvm.fabs.f32(float %a)
ret float %ret
}
define double @abs_d(double %a) {
-; MIPS32: {{(ori|ins|dsll)}}
-; MIPS32-NOT: abs.d
-; MIPS32FP64: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D64
-; MM: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D32_MM
-; MMFP64: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D64_MM
-; MMR6: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D64_MM
+; MIPS32-LABEL: abs_d:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: lui $1, 32767 # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:32767>>
+; MIPS32-NEXT: ori $1, $1, 65535 # <MCInst #[[#MCINST2]] ORi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:65535>>
+; MIPS32-NEXT: mfc1 $2, $f13 # <MCInst #[[#MCINST3]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS32-NEXT: and $1, $2, $1 # <MCInst #[[#MCINST4]] AND
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+; MIPS32-NEXT: mfc1 $2, $f12 # <MCInst #[[#MCINST3]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
+; MIPS32-NEXT: mtc1 $2, $f0 # <MCInst #[[#MCINST6]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST5]] JR
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MIPS32-NEXT: mtc1 $1, $f1 # <MCInst #[[#MCINST6]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+;
+; MIPS32FP64-LABEL: abs_d:
+; MIPS32FP64: # %bb.0:
+; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST5]] JR
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MIPS32FP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST11:]] FABS_D64
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
+;
+; MM-LABEL: abs_d:
+; MM: # %bb.0:
+; MM-NEXT: jr $ra # <MCInst #[[#MCINST8]] JR_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MM-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST12:]] FABS_D32_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG10:]]>
+; MM-NEXT: # <MCOperand Reg:[[#MCREG11:]]>>
+;
+; MMFP64-LABEL: abs_d:
+; MMFP64: # %bb.0:
+; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST8]] JR_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MMFP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST13:]] FABS_D64_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
+;
+; MMR6-LABEL: abs_d:
+; MMR6: # %bb.0:
+; MMR6-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST13:]] FABS_D64_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
+; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST10]] JRC16_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
%ret = call double @llvm.fabs.f64(double %a)
ret double %ret
}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index f041f202777f61..08aa26bd340396 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -22,6 +23,44 @@
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+; CHECK32-LABEL: static_offset(
+; CHECK32: {
+; CHECK32-NEXT: .reg .pred %p<2>;
+; CHECK32-NEXT: .reg .b32 %r<7>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r5, [static_offset_param_2];
+; CHECK32-NEXT: setp.ne.s32 %p1, %r5, 3;
+; CHECK32-NEXT: @%p1 bra $L__BB0_2;
+; CHECK32-NEXT: // %bb.1: // %bb3
+; CHECK32-NEXT: ld.param.u32 %r3, [static_offset_param_0];
+; CHECK32-NEXT: mov.b32 %r4, static_offset_param_1;
+; CHECK32-NEXT: mov.u32 %r1, %r4;
+; CHECK32-NEXT: cvta.to.global.u32 %r2, %r3;
+; CHECK32-NEXT: ld.param.u32 %r6, [%r1+12];
+; CHECK32-NEXT: st.global.u32 [%r2], %r6;
+; CHECK32-NEXT: $L__BB0_2: // %bb6
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: static_offset(
+; CHECK64: {
+; CHECK64-NEXT: .reg .pred %p<2>;
+; CHECK64-NEXT: .reg .b32 %r<3>;
+; CHECK64-NEXT: .reg .b64 %rd<5>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u32 %r1, [static_offset_param_2];
+; CHECK64-NEXT: setp.ne.s32 %p1, %r1, 3;
+; CHECK64-NEXT: @%p1 bra $L__BB0_2;
+; CHECK64-NEXT: // %bb.1: // %bb3
+; CHECK64-NEXT: ld.param.u64 %rd3, [static_offset_param_0];
+; CHECK64-NEXT: mov.b64 %rd4, static_offset_param_1;
+; CHECK64-NEXT: mov.u64 %rd1, %rd4;
+; CHECK64-NEXT: cvta.to.global.u64 %rd2, %rd3;
+; CHECK64-NEXT: ld.param.u32 %r2, [%rd1+12];
+; CHECK64-NEXT: st.global.u32 [%rd2], %r2;
+; CHECK64-NEXT: $L__BB0_2: // %bb6
+; CHECK64-NEXT: ret;
bb:
%tmp = icmp eq i32 %arg2, 3
br i1 %tmp, label %bb3, label %bb6
@@ -56,6 +95,38 @@ bb6: ; preds = %bb3, %bb
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+; CHECK32-LABEL: dynamic_offset(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b32 %r<9>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r1, [dynamic_offset_param_0];
+; CHECK32-NEXT: mov.b32 %r2, dynamic_offset_param_1;
+; CHECK32-NEXT: mov.u32 %r3, %r2;
+; CHECK32-NEXT: cvta.to.global.u32 %r4, %r1;
+; CHECK32-NEXT: ld.param.u32 %r5, [dynamic_offset_param_2];
+; CHECK32-NEXT: shl.b32 %r6, %r5, 2;
+; CHECK32-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK32-NEXT: ld.param.u32 %r8, [%r7];
+; CHECK32-NEXT: st.global.u32 [%r4], %r8;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: dynamic_offset(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b32 %r<3>;
+; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u64 %rd1, [dynamic_offset_param_0];
+; CHECK64-NEXT: mov.b64 %rd2, dynamic_offset_param_1;
+; CHECK64-NEXT: mov.u64 %rd3, %rd2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: ld.param.u32 %r1, [dynamic_offset_param_2];
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: ld.param.u32 %r2, [%rd6];
+; CHECK64-NEXT: st.global.u32 [%rd4], %r2;
+; CHECK64-NEXT: ret;
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
@@ -81,6 +152,40 @@ bb:
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
+; CHECK32-LABEL: gep_bitcast(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b16 %rs<2>;
+; CHECK32-NEXT: .reg .b32 %r<8>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r1, [gep_bitcast_param_0];
+; CHECK32-NEXT: mov.b32 %r2, gep_bitcast_param_1;
+; CHECK32-NEXT: mov.u32 %r3, %r2;
+; CHECK32-NEXT: cvta.to.global.u32 %r4, %r1;
+; CHECK32-NEXT: ld.param.u32 %r5, [gep_bitcast_param_2];
+; CHECK32-NEXT: shl.b32 %r6, %r5, 2;
+; CHECK32-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK32-NEXT: ld.param.u8 %rs1, [%r7];
+; CHECK32-NEXT: st.global.u8 [%r4], %rs1;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: gep_bitcast(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b16 %rs<2>;
+; CHECK64-NEXT: .reg .b32 %r<2>;
+; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u64 %rd1, [gep_bitcast_param_0];
+; CHECK64-NEXT: mov.b64 %rd2, gep_bitcast_param_1;
+; CHECK64-NEXT: mov.u64 %rd3, %rd2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: ld.param.u32 %r1, [gep_bitcast_param_2];
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: ld.param.u8 %rs1, [%rd6];
+; CHECK64-NEXT: st.global.u8 [%rd4], %rs1;
+; CHECK64-NEXT: ret;
bb:
%n64 = sext i32 %n to i64
%gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
@@ -106,6 +211,40 @@ bb:
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
+; CHECK32-LABEL: gep_bitcast_asc(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b16 %rs<2>;
+; CHECK32-NEXT: .reg .b32 %r<8>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r1, [gep_bitcast_asc_param_0];
+; CHECK32-NEXT: mov.b32 %r2, gep_bitcast_asc_param_1;
+; CHECK32-NEXT: mov.u32 %r3, %r2;
+; CHECK32-NEXT: cvta.to.global.u32 %r4, %r1;
+; CHECK32-NEXT: ld.param.u32 %r5, [gep_bitcast_asc_param_2];
+; CHECK32-NEXT: shl.b32 %r6, %r5, 2;
+; CHECK32-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK32-NEXT: ld.param.u8 %rs1, [%r7];
+; CHECK32-NEXT: st.global.u8 [%r4], %rs1;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: gep_bitcast_asc(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b16 %rs<2>;
+; CHECK64-NEXT: .reg .b32 %r<2>;
+; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u64 %rd1, [gep_bitcast_asc_param_0];
+; CHECK64-NEXT: mov.b64 %rd2, gep_bitcast_asc_param_1;
+; CHECK64-NEXT: mov.u64 %rd3, %rd2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: ld.param.u32 %r1, [gep_bitcast_asc_param_2];
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: ld.param.u8 %rs1, [%rd6];
+; CHECK64-NEXT: st.global.u8 [%rd4], %rs1;
+; CHECK64-NEXT: ret;
bb:
%n64 = sext i32 %n to i64
%gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
@@ -140,6 +279,84 @@ bb:
; Function Attrs: convergent norecurse nounwind mustprogress
define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+; CHECK32-LABEL: pointer_escapes(
+; CHECK32: {
+; CHECK32-NEXT: .local .align 4 .b8 __local_depot4[16];
+; CHECK32-NEXT: .reg .b32 %SP;
+; CHECK32-NEXT: .reg .b32 %SPL;
+; CHECK32-NEXT: .reg .b32 %r<16>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: mov.u32 %SPL, __local_depot4;
+; CHECK32-NEXT: ld.param.u32 %r1, [pointer_escapes_param_0];
+; CHECK32-NEXT: add.u32 %r3, %SPL, 0;
+; CHECK32-NEXT: ld.param.u32 %r4, [pointer_escapes_param_2];
+; CHECK32-NEXT: ld.param.u32 %r5, [pointer_escapes_param_1+12];
+; CHECK32-NEXT: ld.param.u32 %r6, [pointer_escapes_param_1+8];
+; CHECK32-NEXT: ld.param.u32 %r7, [pointer_escapes_param_1+4];
+; CHECK32-NEXT: ld.param.u32 %r8, [pointer_escapes_param_1];
+; CHECK32-NEXT: st.local.u32 [%r3], %r8;
+; CHECK32-NEXT: st.local.u32 [%r3+4], %r7;
+; CHECK32-NEXT: st.local.u32 [%r3+8], %r6;
+; CHECK32-NEXT: st.local.u32 [%r3+12], %r5;
+; CHECK32-NEXT: cvta.to.global.u32 %r9, %r1;
+; CHECK32-NEXT: shl.b32 %r10, %r4, 2;
+; CHECK32-NEXT: add.s32 %r11, %r3, %r10;
+; CHECK32-NEXT: cvta.local.u32 %r12, %r11;
+; CHECK32-NEXT: ld.local.u32 %r13, [%r11];
+; CHECK32-NEXT: st.global.u32 [%r9], %r13;
+; CHECK32-NEXT: { // callseq 0, 0
+; CHECK32-NEXT: .param .b32 param0;
+; CHECK32-NEXT: st.param.b32 [param0+0], %r12;
+; CHECK32-NEXT: .param .b32 retval0;
+; CHECK32-NEXT: call.uni (retval0),
+; CHECK32-NEXT: escape,
+; CHECK32-NEXT: (
+; CHECK32-NEXT: param0
+; CHECK32-NEXT: );
+; CHECK32-NEXT: ld.param.b32 %r14, [retval0+0];
+; CHECK32-NEXT: } // callseq 0
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: pointer_escapes(
+; CHECK64: {
+; CHECK64-NEXT: .local .align 4 .b8 __local_depot4[16];
+; CHECK64-NEXT: .reg .b64 %SP;
+; CHECK64-NEXT: .reg .b64 %SPL;
+; CHECK64-NEXT: .reg .b32 %r<7>;
+; CHECK64-NEXT: .reg .b64 %rd<10>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: mov.u64 %SPL, __local_depot4;
+; CHECK64-NEXT: ld.param.u64 %rd1, [pointer_escapes_param_0];
+; CHECK64-NEXT: add.u64 %rd3, %SPL, 0;
+; CHECK64-NEXT: ld.param.u32 %r1, [pointer_escapes_param_2];
+; CHECK64-NEXT: ld.param.u32 %r2, [pointer_escapes_param_1+12];
+; CHECK64-NEXT: ld.param.u32 %r3, [pointer_escapes_param_1+8];
+; CHECK64-NEXT: ld.param.u32 %r4, [pointer_escapes_param_1+4];
+; CHECK64-NEXT: ld.param.u32 %r5, [pointer_escapes_param_1];
+; CHECK64-NEXT: st.local.u32 [%rd3], %r5;
+; CHECK64-NEXT: st.local.u32 [%rd3+4], %r4;
+; CHECK64-NEXT: st.local.u32 [%rd3+8], %r3;
+; CHECK64-NEXT: st.local.u32 [%rd3+12], %r2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: cvta.local.u64 %rd7, %rd6;
+; CHECK64-NEXT: ld.local.u32 %r6, [%rd6];
+; CHECK64-NEXT: st.global.u32 [%rd4], %r6;
+; CHECK64-NEXT: { // callseq 0, 0
+; CHECK64-NEXT: .param .b64 param0;
+; CHECK64-NEXT: st.param.b64 [param0+0], %rd7;
+; CHECK64-NEXT: .param .b64 retval0;
+; CHECK64-NEXT: call.uni (retval0),
+; CHECK64-NEXT: escape,
+; CHECK64-NEXT: (
+; CHECK64-NEXT: param0
+; CHECK64-NEXT: );
+; CHECK64-NEXT: ld.param.b64 %rd8, [retval0+0];
+; CHECK64-NEXT: } // callseq 0
+; CHECK64-NEXT: ret;
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
@@ -164,3 +381,5 @@ declare dso_local ptr @escape(ptr) local_unnamed_addr
!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll
index 77c21564c8aa76..9e311c6833d5eb 100644
--- a/llvm/test/CodeGen/NVPTX/mulwide.ll
+++ b/llvm/test/CodeGen/NVPTX/mulwide.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O3 | %ptxas-verify %}
@@ -6,8 +7,6 @@
; OPT-LABEL: @mulwide16
; NOOPT-LABEL: @mulwide16
define i32 @mulwide16(i16 %a, i16 %b) {
-; OPT: mul.wide.s16
-; NOOPT: mul.lo.s32
%val0 = sext i16 %a to i32
%val1 = sext i16 %b to i32
%val2 = mul i32 %val0, %val1
@@ -17,8 +16,6 @@ define i32 @mulwide16(i16 %a, i16 %b) {
; OPT-LABEL: @mulwideu16
; NOOPT-LABEL: @mulwideu16
define i32 @mulwideu16(i16 %a, i16 %b) {
-; OPT: mul.wide.u16
-; NOOPT: mul.lo.s32
%val0 = zext i16 %a to i32
%val1 = zext i16 %b to i32
%val2 = mul i32 %val0, %val1
@@ -28,8 +25,6 @@ define i32 @mulwideu16(i16 %a, i16 %b) {
; OPT-LABEL: @mulwide8
; NOOPT-LABEL: @mulwide8
define i32 @mulwide8(i8 %a, i8 %b) {
-; OPT: mul.wide.s16
-; NOOPT: mul.lo.s32
%val0 = sext i8 %a to i32
%val1 = sext i8 %b to i32
%val2 = mul i32 %val0, %val1
@@ -39,8 +34,6 @@ define i32 @mulwide8(i8 %a, i8 %b) {
; OPT-LABEL: @mulwideu8
; NOOPT-LABEL: @mulwideu8
define i32 @mulwideu8(i8 %a, i8 %b) {
-; OPT: mul.wide.u16
-; NOOPT: mul.lo.s32
%val0 = zext i8 %a to i32
%val1 = zext i8 %b to i32
%val2 = mul i32 %val0, %val1
@@ -50,8 +43,6 @@ define i32 @mulwideu8(i8 %a, i8 %b) {
; OPT-LABEL: @mulwide32
; NOOPT-LABEL: @mulwide32
define i64 @mulwide32(i32 %a, i32 %b) {
-; OPT: mul.wide.s32
-; NOOPT: mul.lo.s64
%val0 = sext i32 %a to i64
%val1 = sext i32 %b to i64
%val2 = mul i64 %val0, %val1
@@ -61,8 +52,6 @@ define i64 @mulwide32(i32 %a, i32 %b) {
; OPT-LABEL: @mulwideu32
; NOOPT-LABEL: @mulwideu32
define i64 @mulwideu32(i32 %a, i32 %b) {
-; OPT: mul.wide.u32
-; NOOPT: mul.lo.s64
%val0 = zext i32 %a to i64
%val1 = zext i32 %b to i64
%val2 = mul i64 %val0, %val1
@@ -72,8 +61,6 @@ define i64 @mulwideu32(i32 %a, i32 %b) {
; OPT-LABEL: @mulwideu7
; NOOPT-LABEL: @mulwideu7
define i64 @mulwideu7(i7 %a, i7 %b) {
-; OPT: mul.wide.u32
-; NOOPT: mul.lo.s64
%val0 = zext i7 %a to i64
%val1 = zext i7 %b to i64
%val2 = mul i64 %val0, %val1
@@ -83,8 +70,6 @@ define i64 @mulwideu7(i7 %a, i7 %b) {
; OPT-LABEL: @mulwides7
; NOOPT-LABEL: @mulwides7
define i64 @mulwides7(i7 %a, i7 %b) {
-; OPT: mul.wide.s32
-; NOOPT: mul.lo.s64
%val0 = sext i7 %a to i64
%val1 = sext i7 %b to i64
%val2 = mul i64 %val0, %val1
@@ -94,8 +79,6 @@ define i64 @mulwides7(i7 %a, i7 %b) {
; OPT-LABEL: @shl30
; NOOPT-LABEL: @shl30
define i64 @shl30(i32 %a) {
-; OPT: mul.wide
-; NOOPT: shl.b64
%conv = sext i32 %a to i64
%shl = shl i64 %conv, 30
ret i64 %shl
@@ -104,9 +87,10 @@ define i64 @shl30(i32 %a) {
; OPT-LABEL: @shl31
; NOOPT-LABEL: @shl31
define i64 @shl31(i32 %a) {
-; OPT-NOT: mul.wide
-; NOOPT-NOT: mul.wide
%conv = sext i32 %a to i64
%shl = shl i64 %conv, 31
ret i64 %shl
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NOOPT: {{.*}}
+; OPT: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 40a3e9e945a23e..34461914915966 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Verifies correctness of load/store of parameters and return values.
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
@@ -383,3 +384,5 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
%r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
ret %s_i8f64p %r
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/PowerPC/coalesce-ext.ll b/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
index 67de45f453d5de..bd726d330dbb7b 100644
--- a/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -1,18 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -verify-machineinstrs -mcpu=g5 -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names < %s | FileCheck %s
; Check that the peephole optimizer knows about sext and zext instructions.
; CHECK: test1sext
define i32 @test1sext(i64 %A, i64 %B, ptr %P, ptr %P2) nounwind {
+; CHECK-LABEL: test1sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add r4, r3, r4
+; CHECK-NEXT: extsw r3, r4
+; CHECK-NEXT: std r3, 0(r6)
+; CHECK-NEXT: add r3, r4, r4
+; CHECK-NEXT: stw r4, 0(r5)
+; CHECK-NEXT: blr
%C = add i64 %A, %B
- ; CHECK: add [[SUM:r[0-9]+]], r3, r4
%D = trunc i64 %C to i32
%E = shl i64 %C, 32
%F = ashr i64 %E, 32
- ; CHECK: extsw [[EXT:r[0-9]+]], [[SUM]]
store volatile i64 %F, ptr %P2
- ; CHECK-DAG: std [[EXT]]
store volatile i32 %D, ptr %P
; Reuse low bits of extended register, don't extend live range of SUM.
- ; CHECK-DAG: stw [[SUM]]
%R = add i32 %D, %D
ret i32 %R
}
diff --git a/llvm/test/CodeGen/PowerPC/extsh.ll b/llvm/test/CodeGen/PowerPC/extsh.ll
index f4c83ec9d0cf3e..c057d3f119ddfd 100644
--- a/llvm/test/CodeGen/PowerPC/extsh.ll
+++ b/llvm/test/CodeGen/PowerPC/extsh.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; This should turn into a single extsh
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | grep extsh | count 1
define i32 @test(i32 %X) {
diff --git a/llvm/test/CodeGen/PowerPC/shl_sext.ll b/llvm/test/CodeGen/PowerPC/shl_sext.ll
index cf83944fef8d65..53da81f45ee11d 100644
--- a/llvm/test/CodeGen/PowerPC/shl_sext.ll
+++ b/llvm/test/CodeGen/PowerPC/shl_sext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; This test should not contain a sign extend
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | not grep extsb
diff --git a/llvm/test/CodeGen/SystemZ/int-abs-01.ll b/llvm/test/CodeGen/SystemZ/int-abs-01.ll
index 053c347c0b7560..7bdf622ed67d1a 100644
--- a/llvm/test/CodeGen/SystemZ/int-abs-01.ll
+++ b/llvm/test/CodeGen/SystemZ/int-abs-01.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test integer absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
@@ -5,8 +6,9 @@
; Test i32->i32 absolute using slt.
define i32 @f1(i32 %val) {
; CHECK-LABEL: f1:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %neg, i32 %val
@@ -16,8 +18,9 @@ define i32 @f1(i32 %val) {
; Test i32->i32 absolute using sle.
define i32 @f2(i32 %val) {
; CHECK-LABEL: f2:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sle i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %neg, i32 %val
@@ -27,8 +30,9 @@ define i32 @f2(i32 %val) {
; Test i32->i32 absolute using sgt.
define i32 @f3(i32 %val) {
; CHECK-LABEL: f3:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sgt i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %val, i32 %neg
@@ -38,8 +42,9 @@ define i32 @f3(i32 %val) {
; Test i32->i32 absolute using sge.
define i32 @f4(i32 %val) {
; CHECK-LABEL: f4:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sge i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %val, i32 %neg
@@ -49,8 +54,9 @@ define i32 @f4(i32 %val) {
; Test i32->i64 absolute.
define i64 @f5(i32 %val) {
; CHECK-LABEL: f5:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i64 %ext, 0
%neg = sub i64 0, %ext
@@ -61,8 +67,9 @@ define i64 @f5(i32 %val) {
; Test i32->i64 absolute that uses an "in-register" form of sign extension.
define i64 @f6(i64 %val) {
; CHECK-LABEL: f6:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%trunc = trunc i64 %val to i32
%ext = sext i32 %trunc to i64
%cmp = icmp slt i64 %ext, 0
@@ -74,8 +81,9 @@ define i64 @f6(i64 %val) {
; Test i64 absolute.
define i64 @f7(i64 %val) {
; CHECK-LABEL: f7:
-; CHECK: lpgr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i64 %val, 0
%neg = sub i64 0, %val
%res = select i1 %cmp, i64 %neg, i64 %val
@@ -85,8 +93,9 @@ define i64 @f7(i64 %val) {
; Test another form of f6, which is that produced by InstCombine.
define i64 @f8(i64 %val) {
; CHECK-LABEL: f8:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -98,8 +107,9 @@ define i64 @f8(i64 %val) {
; Try again with sle rather than slt.
define i64 @f9(i64 %val) {
; CHECK-LABEL: f9:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -111,8 +121,9 @@ define i64 @f9(i64 %val) {
; Repeat f8 with the operands reversed.
define i64 @f10(i64 %val) {
; CHECK-LABEL: f10:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -124,8 +135,9 @@ define i64 @f10(i64 %val) {
; Try again with sge rather than sgt.
define i64 @f11(i64 %val) {
; CHECK-LABEL: f11:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -137,8 +149,9 @@ define i64 @f11(i64 %val) {
; Repeat f5 with the comparison on the unextended value.
define i64 @f12(i32 %val) {
; CHECK-LABEL: f12:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i32 %val, 0
%neg = sub i64 0, %ext
diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-44.ll b/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
index 41ace057706c3c..559dcfe25d237f 100644
--- a/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test that compares are omitted if CC already has the right value
; (z10 version).
;
@@ -11,9 +12,12 @@ declare void @foo()
; First test the EQ case.
define i32 @f1(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f1:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB0_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp eq i32 %res, 0
@@ -30,9 +34,12 @@ exit:
; ...and again with NE.
define i32 @f2(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f2:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB1_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp ne i32 %res, 0
@@ -49,8 +56,12 @@ exit:
; ...and again with SLT.
define i32 @f3(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f3:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: blr %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB2_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp slt i32 %res, 0
@@ -67,8 +78,12 @@ exit:
; ...and again with SLE.
define i32 @f4(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f4:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: bler %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB3_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp sle i32 %res, 0
@@ -85,8 +100,12 @@ exit:
; ...and again with SGT.
define i32 @f5(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f5:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: bhr %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB4_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp sgt i32 %res, 0
@@ -103,8 +122,12 @@ exit:
; ...and again with SGE.
define i32 @f6(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f6:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: bher %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB5_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp sge i32 %res, 0
@@ -122,9 +145,12 @@ exit:
; zero even without "nsw".
define i32 @f7(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f7:
-; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: bner %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: s %r2, 0(%r4)
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB6_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%cur = load i32, ptr %dest
%res = sub i32 %a, %cur
@@ -142,8 +168,12 @@ exit:
; ...and again with SLT.
define i32 @f8(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f8:
-; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: blr %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: s %r2, 0(%r4)
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB7_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%cur = load i32, ptr %dest
%res = sub nsw i32 %a, %cur
@@ -162,9 +192,12 @@ exit:
; comparisons with zero.
define i32 @f9(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f9:
-; CHECK: nr %r2, %r3
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nr %r2, %r3
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB8_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, %b
%cmp = icmp ne i32 %res, 0
@@ -181,9 +214,12 @@ exit:
; ...but not for ordered comparisons.
define i32 @f10(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f10:
-; CHECK: nr %r2, %r3
-; CHECK-NEXT: cibl %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nr %r2, %r3
+; CHECK-NEXT: cibl %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB9_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, %b
%cmp = icmp slt i32 %res, 0
@@ -201,9 +237,12 @@ exit:
; comparisons with zero if the immediate covers the whole register.
define i32 @f11(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f11:
-; CHECK: nilf %r2, 100000001
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nilf %r2, 100000001
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB10_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, 100000001
%cmp = icmp ne i32 %res, 0
@@ -221,9 +260,12 @@ exit:
; zero results.
define i32 @f12(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f12:
-; CHECK: nill %r2, 65436
-; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nill %r2, 65436
+; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB11_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, -100
%cmp = icmp ne i32 %res, 0
@@ -240,9 +282,12 @@ exit:
; SRA provides the same CC result as a comparison with zero.
define i32 @f13(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f13:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB12_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp eq i32 %res, 0
@@ -259,9 +304,12 @@ exit:
; ...and again with NE.
define i32 @f14(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f14:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB13_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp ne i32 %res, 0
@@ -278,9 +326,12 @@ exit:
; ...and SLT.
define i32 @f15(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f15:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB14_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp slt i32 %res, 0
@@ -297,9 +348,12 @@ exit:
; ...and SLE.
define i32 @f16(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f16:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: bler %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB15_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp sle i32 %res, 0
@@ -316,9 +370,12 @@ exit:
; ...and SGT.
define i32 @f17(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f17:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB16_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp sgt i32 %res, 0
@@ -335,9 +392,12 @@ exit:
; ...and SGE.
define i32 @f18(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f18:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: bher %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB17_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp sge i32 %res, 0
@@ -355,9 +415,12 @@ exit:
; Test the EQ case.
define i64 @f19(i64 %a, i64 %b, ptr %dest) {
; CHECK-LABEL: f19:
-; CHECK: risbg %r2, %r3, 0, 190, 0
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: risbg %r2, %r3, 0, 190, 0
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB18_1: # %store
+; CHECK-NEXT: stg %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i64 %b, -2
%cmp = icmp eq i64 %res, 0
@@ -374,9 +437,12 @@ exit:
; ...and the SLT case.
define i64 @f20(i64 %a, i64 %b, ptr %dest) {
; CHECK-LABEL: f20:
-; CHECK: risbg %r2, %r3, 0, 190, 0
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: risbg %r2, %r3, 0, 190, 0
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB19_1: # %store
+; CHECK-NEXT: stg %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i64 %b, -2
%cmp = icmp slt i64 %res, 0
@@ -394,12 +460,15 @@ exit:
; instruction.
define i32 @f21(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f21:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB20_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i32 %a, 1000000
%res = call i32 asm "blah $0", "=r,0" (i32 %add)
@@ -417,12 +486,15 @@ exit:
; ...and again with a CC-clobbering instruction.
define i32 @f22(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f22:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB21_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i32 %a, 1000000
%res = call i32 asm "blah $0", "=r,0,~{cc}" (i32 %add)
@@ -440,10 +512,13 @@ exit:
; Check that stores do not interfere.
define i32 @f23(i32 %a, i32 %b, ptr %dest1, ptr %dest2) {
; CHECK-LABEL: f23:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: st %r2, 0(%r4)
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB22_1: # %store
+; CHECK-NEXT: st %r3, 0(%r5)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
store i32 %res, ptr %dest1
@@ -461,10 +536,25 @@ exit:
; Check that calls do interfere.
define void @f24(ptr %ptr) {
; CHECK-LABEL: f24:
-; CHECK: afi [[REG:%r[0-9]+]], 1000000
-; CHECK-NEXT: brasl %r14, foo at PLT
-; CHECK-NEXT: cijlh [[REG]], 0, .L{{.*}}
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lgr %r13, %r2
+; CHECK-NEXT: lhi %r12, 1
+; CHECK-NEXT: x %r12, 0(%r2)
+; CHECK-NEXT: afi %r12, 1000000
+; CHECK-NEXT: brasl %r14, foo at PLT
+; CHECK-NEXT: cijlh %r12, 0, .LBB23_2
+; CHECK-NEXT: # %bb.1: # %store
+; CHECK-NEXT: st %r12, 0(%r13)
+; CHECK-NEXT: .LBB23_2: # %exit
+; CHECK-NEXT: lmg %r12, %r15, 256(%r15)
+; CHECK-NEXT: br %r14
entry:
%val = load i32, ptr %ptr
%xor = xor i32 %val, 1
@@ -484,12 +574,15 @@ exit:
; Check that inline asms don't interfere if they don't clobber CC.
define void @f25(i32 %a, ptr %ptr) {
; CHECK-LABEL: f25:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB24_1: # %store
+; CHECK-NEXT: st %r2, 0(%r3)
+; CHECK-NEXT: br %r14
entry:
%add = add nsw i32 %a, 1000000
call void asm sideeffect "blah", "r"(i32 %add)
@@ -507,12 +600,15 @@ exit:
; ...but do interfere if they do clobber CC.
define void @f26(i32 %a, ptr %ptr) {
; CHECK-LABEL: f26:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB25_1: # %store
+; CHECK-NEXT: st %r2, 0(%r3)
+; CHECK-NEXT: br %r14
entry:
%add = add i32 %a, 1000000
call void asm sideeffect "blah", "r,~{cc}"(i32 %add)
@@ -531,11 +627,14 @@ exit:
; compare input.
define i32 @f27(i32 %a, i32 %b, ptr %dest1, ptr %dest2) {
; CHECK-LABEL: f27:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: sr %r3, %r2
-; CHECK-NEXT: st %r3, 0(%r4)
-; CHECK-NEXT: cibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: sr %r3, %r2
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB26_1: # %store
+; CHECK-NEXT: st %r3, 0(%r5)
+; CHECK-NEXT: br %r14
entry:
%add = add nsw i32 %a, 1000000
%sub = sub i32 %b, %add
@@ -554,9 +653,12 @@ exit:
; Make sure that we don't confuse a base register for a destination.
define void @f28(i64 %a, ptr %dest) {
; CHECK-LABEL: f28:
-; CHECK: xi 0(%r2), 15
-; CHECK: cgibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xi 0(%r2), 15
+; CHECK-NEXT: cgibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB27_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r3)
+; CHECK-NEXT: br %r14
entry:
%ptr = inttoptr i64 %a to ptr
%val = load i8, ptr %ptr
@@ -576,9 +678,12 @@ exit:
; Test that L gets converted to LT where useful.
define i32 @f29(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f29:
-; CHECK: lt %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bler %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lt %r2, 0(%r3,%r2)
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB28_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
@@ -597,9 +702,12 @@ exit:
; Test that LY gets converted to LT where useful.
define i32 @f30(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f30:
-; CHECK: lt %r2, 100000({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bler %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lt %r2, 100000(%r3,%r2)
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB29_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 100000
@@ -619,9 +727,12 @@ exit:
; Test that LG gets converted to LTG where useful.
define i64 @f31(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f31:
-; CHECK: ltg %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bher %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltg %r2, 0(%r3,%r2)
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB30_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
@@ -640,9 +751,12 @@ exit:
; Test that LGF gets converted to LTGF where useful.
define i64 @f32(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f32:
-; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgf %r2, 0(%r3,%r2)
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB31_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
@@ -662,12 +776,15 @@ exit:
; Test that LR gets converted to LTR where useful.
define i32 @f33(i32 %dummy, i32 %val, ptr %dest) {
; CHECK-LABEL: f33:
-; CHECK: ltr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB32_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r2}"(i32 %val)
%cmp = icmp slt i32 %val, 0
@@ -684,12 +801,15 @@ exit:
; Test that LGR gets converted to LTGR where useful.
define i64 @f34(i64 %dummy, i64 %val, ptr %dest) {
; CHECK-LABEL: f34:
-; CHECK: ltgr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB33_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r2}"(i64 %val)
%cmp = icmp sgt i64 %val, 0
@@ -706,12 +826,15 @@ exit:
; Test that LGFR gets converted to LTGFR where useful.
define i64 @f35(i64 %dummy, i32 %val, ptr %dest) {
; CHECK-LABEL: f35:
-; CHECK: ltgfr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB34_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%ext = sext i32 %val to i64
call void asm sideeffect "blah $0", "{r2}"(i64 %ext)
@@ -730,12 +853,15 @@ exit:
; we need.
define i32 @f36(i32 %val, i32 %dummy, ptr %dest) {
; CHECK-LABEL: f36:
-; CHECK: ltr %r3, %r2
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r3
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltr %r3, %r2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r3
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB35_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r3}"(i32 %val)
%cmp = icmp slt i32 %val, 0
@@ -753,12 +879,15 @@ exit:
; we need.
define i64 @f37(i64 %val, i64 %dummy, ptr %dest) {
; CHECK-LABEL: f37:
-; CHECK: ltgr %r3, %r2
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r3
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgr %r3, %r2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r3
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB36_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r3}"(i64 %val)
%cmp = icmp slt i64 %val, 0
@@ -776,12 +905,15 @@ exit:
; we need.
define i32 @f38(i32 %val, i64 %dummy, ptr %dest) {
; CHECK-LABEL: f38:
-; CHECK: ltgfr %r3, %r2
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r3
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r3, %r2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r3
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB37_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%ext = sext i32 %val to i64
call void asm sideeffect "blah $0", "{r3}"(i64 %ext)
@@ -799,12 +931,15 @@ exit:
; Test f35 for in-register extensions.
define i64 @f39(i64 %dummy, i64 %a, ptr %dest) {
; CHECK-LABEL: f39:
-; CHECK: ltgfr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB38_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%val = trunc i64 %a to i32
%ext = sext i32 %val to i64
@@ -823,12 +958,15 @@ exit:
; ...and again with what InstCombine would produce for f40.
define i64 @f40(i64 %dummy, i64 %a, ptr %dest) {
; CHECK-LABEL: f40:
-; CHECK: ltgfr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB39_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%shl = shl i64 %a, 32
%ext = ashr i64 %shl, 32
@@ -847,9 +985,12 @@ exit:
; Try a form of f7 in which the subtraction operands are compared directly.
define i32 @f41(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f41:
-; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: bner %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: s %r2, 0(%r4)
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB40_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%cur = load i32, ptr %dest
%res = sub i32 %a, %cur
@@ -867,9 +1008,12 @@ exit:
; A version of f32 that tests the unextended value.
define i64 @f42(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f42:
-; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgf %r2, 0(%r3,%r2)
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB41_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-10.ll b/llvm/test/CodeGen/SystemZ/int-mul-10.ll
index 539a48622b8baa..3516d645f54707 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-10.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-10.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test signed high-part i64->i128 multiplications on z14.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -asm-verbose=0 | FileCheck %s
@@ -6,10 +7,6 @@ declare i64 @foo()
; Check sign-extended multiplication in which only the high part is used.
define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
-; CHECK-LABEL: f1:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mgrk %r2, %r3, %r4
-; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
%mulx = mul i128 %ax, %bx
@@ -21,11 +18,6 @@ define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
; Check sign-extended multiplication in which only part of the high half
; is used.
define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
-; CHECK-LABEL: f2:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mgrk [[REG:%r[0-9]+]], %r3, %r4
-; CHECK: srlg %r2, [[REG]], 3
-; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
%mulx = mul i128 %ax, %bx
@@ -37,11 +29,6 @@ define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
; Check sign-extended multiplication in which the result is split into
; high and low halves.
define i64 @f3(i64 %dummy, i64 %a, i64 %b) {
-; CHECK-LABEL: f3:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mgrk %r2, %r3, %r4
-; CHECK: ogr %r2, %r3
-; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
%mulx = mul i128 %ax, %bx
@@ -54,10 +41,6 @@ define i64 @f3(i64 %dummy, i64 %a, i64 %b) {
; Check MG with no displacement.
define i64 @f4(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f4:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mg %r2, 0(%r4)
-; CHECK: br %r14
%b = load i64, ptr %src
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
@@ -69,9 +52,6 @@ define i64 @f4(i64 %dummy, i64 %a, ptr %src) {
; Check the high end of the aligned MG range.
define i64 @f5(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f5:
-; CHECK: mg %r2, 524280(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 65535
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -85,10 +65,6 @@ define i64 @f5(i64 %dummy, i64 %a, ptr %src) {
; Check the next doubleword up, which requires separate address logic.
; Other sequences besides this one would be OK.
define i64 @f6(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f6:
-; CHECK: agfi %r4, 524288
-; CHECK: mg %r2, 0(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 65536
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -101,9 +77,6 @@ define i64 @f6(i64 %dummy, i64 %a, ptr %src) {
; Check the high end of the negative aligned MG range.
define i64 @f7(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f7:
-; CHECK: mg %r2, -8(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 -1
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -116,9 +89,6 @@ define i64 @f7(i64 %dummy, i64 %a, ptr %src) {
; Check the low end of the MG range.
define i64 @f8(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f8:
-; CHECK: mg %r2, -524288(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 -65536
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -132,10 +102,6 @@ define i64 @f8(i64 %dummy, i64 %a, ptr %src) {
; Check the next doubleword down, which needs separate address logic.
; Other sequences besides this one would be OK.
define i64 @f9(ptr %dest, i64 %a, ptr %src) {
-; CHECK-LABEL: f9:
-; CHECK: agfi %r4, -524296
-; CHECK: mg %r2, 0(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 -65537
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -148,9 +114,6 @@ define i64 @f9(ptr %dest, i64 %a, ptr %src) {
; Check that MG allows an index.
define i64 @f10(ptr %dest, i64 %a, i64 %src, i64 %index) {
-; CHECK-LABEL: f10:
-; CHECK: mg %r2, 524287(%r5,%r4)
-; CHECK: br %r14
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 524287
%ptr = inttoptr i64 %add2 to ptr
@@ -163,3 +126,5 @@ define i64 @f10(ptr %dest, i64 %a, i64 %src, i64 %index) {
ret i64 %high
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/SystemZ/int-neg-02.ll b/llvm/test/CodeGen/SystemZ/int-neg-02.ll
index 7f3f6375129aa7..7d62fe743a8b62 100644
--- a/llvm/test/CodeGen/SystemZ/int-neg-02.ll
+++ b/llvm/test/CodeGen/SystemZ/int-neg-02.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test negative integer absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
@@ -5,8 +6,9 @@
; Test i32->i32 negative absolute using slt.
define i32 @f1(i32 %val) {
; CHECK-LABEL: f1:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %neg, i32 %val
@@ -17,8 +19,9 @@ define i32 @f1(i32 %val) {
; Test i32->i32 negative absolute using sle.
define i32 @f2(i32 %val) {
; CHECK-LABEL: f2:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sle i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %neg, i32 %val
@@ -29,8 +32,9 @@ define i32 @f2(i32 %val) {
; Test i32->i32 negative absolute using sgt.
define i32 @f3(i32 %val) {
; CHECK-LABEL: f3:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sgt i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %val, i32 %neg
@@ -41,8 +45,9 @@ define i32 @f3(i32 %val) {
; Test i32->i32 negative absolute using sge.
define i32 @f4(i32 %val) {
; CHECK-LABEL: f4:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sge i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %val, i32 %neg
@@ -53,8 +58,9 @@ define i32 @f4(i32 %val) {
; Test i32->i64 negative absolute.
define i64 @f5(i32 %val) {
; CHECK-LABEL: f5:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i64 %ext, 0
%neg = sub i64 0, %ext
@@ -67,8 +73,9 @@ define i64 @f5(i32 %val) {
; sign extension.
define i64 @f6(i64 %val) {
; CHECK-LABEL: f6:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%trunc = trunc i64 %val to i32
%ext = sext i32 %trunc to i64
%cmp = icmp slt i64 %ext, 0
@@ -81,8 +88,9 @@ define i64 @f6(i64 %val) {
; Test i64 negative absolute.
define i64 @f7(i64 %val) {
; CHECK-LABEL: f7:
-; CHECK: lngr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i64 %val, 0
%neg = sub i64 0, %val
%abs = select i1 %cmp, i64 %neg, i64 %val
@@ -93,8 +101,9 @@ define i64 @f7(i64 %val) {
; Test another form of f6, which is that produced by InstCombine.
define i64 @f8(i64 %val) {
; CHECK-LABEL: f8:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -107,8 +116,9 @@ define i64 @f8(i64 %val) {
; Try again with sle rather than slt.
define i64 @f9(i64 %val) {
; CHECK-LABEL: f9:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -121,8 +131,9 @@ define i64 @f9(i64 %val) {
; Repeat f8 with the operands reversed.
define i64 @f10(i64 %val) {
; CHECK-LABEL: f10:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -135,8 +146,9 @@ define i64 @f10(i64 %val) {
; Try again with sge rather than sgt.
define i64 @f11(i64 %val) {
; CHECK-LABEL: f11:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -149,8 +161,9 @@ define i64 @f11(i64 %val) {
; Repeat f8 with the negation coming from swapped operands.
define i64 @f12(i64 %val) {
; CHECK-LABEL: f12:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -162,8 +175,9 @@ define i64 @f12(i64 %val) {
; Likewise f9.
define i64 @f13(i64 %val) {
; CHECK-LABEL: f13:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -175,8 +189,9 @@ define i64 @f13(i64 %val) {
; Likewise f10.
define i64 @f14(i64 %val) {
; CHECK-LABEL: f14:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -188,8 +203,9 @@ define i64 @f14(i64 %val) {
; Likewise f11.
define i64 @f15(i64 %val) {
; CHECK-LABEL: f15:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -201,8 +217,9 @@ define i64 @f15(i64 %val) {
; Repeat f5 with the comparison on the unextended value.
define i64 @f16(i32 %val) {
; CHECK-LABEL: f16:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i32 %val, 0
%neg = sub i64 0, %ext
@@ -214,8 +231,9 @@ define i64 @f16(i32 %val) {
; And again with the negation coming from swapped operands.
define i64 @f17(i32 %val) {
; CHECK-LABEL: f17:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i32 %val, 0
%neg = sub i64 0, %ext
diff --git a/llvm/test/CodeGen/Thumb2/bfx.ll b/llvm/test/CodeGen/Thumb2/bfx.ll
index 9bd8d70275b924..0191b81805fd1d 100644
--- a/llvm/test/CodeGen/Thumb2/bfx.ll
+++ b/llvm/test/CodeGen/Thumb2/bfx.ll
@@ -1,8 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @sbfx1(i32 %a) {
-; CHECK: sbfx1
-; CHECK: sbfx r0, r0, #7, #11
+; CHECK-LABEL: sbfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sbfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = sext i11 %t2 to i32
@@ -10,8 +13,10 @@ define i32 @sbfx1(i32 %a) {
}
define i32 @ubfx1(i32 %a) {
-; CHECK: ubfx1
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = zext i11 %t2 to i32
@@ -19,8 +24,10 @@ define i32 @ubfx1(i32 %a) {
}
define i32 @ubfx2(i32 %a) {
-; CHECK: ubfx2
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = and i32 %t1, 2047
ret i32 %t2
diff --git a/llvm/test/CodeGen/VE/Scalar/bitreverse.ll b/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
index 208c207ff51392..e95f10e85de452 100644
--- a/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
+++ b/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
declare i128 @llvm.bitreverse.i128(i128)
diff --git a/llvm/test/CodeGen/WebAssembly/conv.ll b/llvm/test/CodeGen/WebAssembly/conv.ll
index cf76548aad17d1..9c7da4b741929a 100644
--- a/llvm/test/CodeGen/WebAssembly/conv.ll
+++ b/llvm/test/CodeGen/WebAssembly/conv.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+nontrapping-fptoint | FileCheck %s
; Test that basic conversion operations assemble as expected.
@@ -354,3 +355,5 @@ define i16 @i16_trunc_sat_u_f64(double %x) {
%a = call i16 @llvm.fptoui.sat.i16.f64(double %x)
ret i16 %a
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll b/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll
index 45080d14dfd29b..a5e81064ae66fd 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mcpu=mvp -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals | FileCheck %s --check-prefixes CHECK,NO-SIMD128
@@ -135,3 +136,7 @@ define i64 @sext_inreg_i32_to_i64(<2 x i64> %x) {
%res = ashr i64 %a, 32
ret i64 %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; NO-SIMD128: {{.*}}
+; SIMD128: {{.*}}
diff --git a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
index 4ecb1bc31f2a80..e48618ba7a53d3 100644
--- a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
+++ b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
@@ -1,10 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-ALL
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-cbranch < %s | FileCheck %s --check-prefix=X64
; RUN: llc -O0 -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64-NOOPT
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @test(ptr %secret, i32 %secret_size) #0 {
-; X64-LABEL: test:
+; X64-ALL-LABEL: test:
+; X64-ALL: # %bb.0: # %entry
+; X64-ALL-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: jmp .LBB0_1
+; X64-ALL-NEXT: .p2align 4, 0x90
+; X64-ALL-NEXT: .LBB0_4: # %if.end
+; X64-ALL-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-ALL-NEXT: incl -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: .LBB0_1: # %for.cond
+; X64-ALL-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-ALL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: jge .LBB0_5
+; X64-ALL-NEXT: # %bb.2: # %for.body
+; X64-ALL-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-ALL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movl %eax, %ecx
+; X64-ALL-NEXT: shrl $31, %ecx
+; X64-ALL-NEXT: addl %eax, %ecx
+; X64-ALL-NEXT: andl $-2, %ecx
+; X64-ALL-NEXT: cmpl %ecx, %eax
+; X64-ALL-NEXT: jne .LBB0_4
+; X64-ALL-NEXT: # %bb.3: # %if.then
+; X64-ALL-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-ALL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movq (%rax,%rcx,8), %rax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movl (%rax), %eax
+; X64-ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: jmp .LBB0_4
+; X64-ALL-NEXT: .LBB0_5: # %for.end
+; X64-ALL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: retq
+;
+; X64-NOOPT-LABEL: test:
+; X64-NOOPT: # %bb.0: # %entry
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: .LBB0_1: # %for.cond
+; X64-NOOPT-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jge .LBB0_6
+; X64-NOOPT-NEXT: # %bb.2: # %for.body
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: movl $2, %ecx
+; X64-NOOPT-NEXT: cltd
+; X64-NOOPT-NEXT: idivl %ecx
+; X64-NOOPT-NEXT: cmpl $0, %edx
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jne .LBB0_4
+; X64-NOOPT-NEXT: # %bb.3: # %if.then
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl (%rax), %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: .LBB0_4: # %if.end
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jmp .LBB0_5
+; X64-NOOPT-NEXT: .LBB0_5: # %for.inc
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: addl $1, %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jmp .LBB0_1
+; X64-NOOPT-NEXT: .LBB0_6: # %for.end
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: retq
entry:
%secret.addr = alloca ptr, align 8
%secret_size.addr = alloca i32, align 4
@@ -17,23 +118,7 @@ entry:
store i32 0, ptr %i, align 4
br label %for.cond
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: lfence
-; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: jmp .LBB0_1
-
-; X64-NOOPT: # %bb.0: # %entry
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+
for.cond: ; preds = %for.inc, %entry
%0 = load i32, ptr %i, align 4
@@ -41,22 +126,7 @@ for.cond: ; preds = %for.inc, %entry
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %for.body, label %for.end
-; X64: .LBB0_1: # %for.cond
-; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-ALL-NEXT: lfence
-; X64-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
-; X64-ALL-NEXT: lfence
-; X64-NEXT: jge .LBB0_5
-
-; X64-NOOPT: .LBB0_1: # %for.cond
-; X64-NOOPT-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: jge .LBB0_6
+
for.body: ; preds = %for.cond
%2 = load i32, ptr %i, align 4
@@ -64,27 +134,7 @@ for.body: ; preds = %for.cond
%cmp1 = icmp eq i32 %rem, 0
br i1 %cmp1, label %if.then, label %if.end
-; X64: # %bb.2: # %for.body
-; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-ALL-NEXT: lfence
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $31, %ecx
-; X64-NEXT: addl %eax, %ecx
-; X64-NEXT: andl $-2, %ecx
-; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB0_4
-
-; X64-NOOPT: # %bb.2: # %for.body
-; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: movl $2, %ecx
-; X64-NOOPT-NEXT: cltd
-; X64-NOOPT-NEXT: idivl %ecx
-; X64-NOOPT-NEXT: cmpl $0, %edx
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: jne .LBB0_4
+
if.then: ; preds = %for.body
%3 = load ptr, ptr %secret.addr, align 8
@@ -96,30 +146,7 @@ if.then: ; preds = %for.body
store i32 %6, ptr %ret_val, align 4
br label %if.end
-; X64: # %bb.3: # %if.then
-; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: lfence
-; X64-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: lfence
-; X64-NEXT: movq (%rax,%rcx,8), %rax
-; X64-NEXT: lfence
-; X64-NEXT: movl (%rax), %eax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: jmp .LBB0_4
-
-; X64-NOOPT: # %bb.3: # %if.then
-; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl (%rax), %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+
if.end: ; preds = %if.then, %for.body
br label %for.inc
@@ -130,15 +157,6 @@ for.inc: ; preds = %if.end
store i32 %inc, ptr %i, align 4
br label %for.cond
-; X64-NOOPT: .LBB0_5: # %for.inc
-; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: addl $1, %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: jmp .LBB0_1
for.end: ; preds = %for.cond
%8 = load i32, ptr %ret_val, align 4
@@ -150,3 +168,5 @@ declare void @llvm.x86.sse2.lfence() #1
attributes #0 = { "target-features"="+lvi-load-hardening" }
attributes #1 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sext-subreg.ll b/llvm/test/CodeGen/X86/sext-subreg.ll
index 3e54f24d13affe..20451ff208cc05 100644
--- a/llvm/test/CodeGen/X86/sext-subreg.ll
+++ b/llvm/test/CodeGen/X86/sext-subreg.ll
@@ -1,16 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; rdar://7529457
define i64 @t(i64 %A, i64 %B, ptr %P, ptr%P2) nounwind {
; CHECK-LABEL: t:
-; CHECK: movslq %e{{.*}}, %rax
-; CHECK: movq %rax
-; CHECK: movl %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: movslq %edi, %rax
+; CHECK-NEXT: movq %rax, (%rcx)
+; CHECK-NEXT: movl %eax, (%rdx)
+; CHECK-NEXT: retq
%C = add i64 %A, %B
%D = trunc i64 %C to i32
store volatile i32 %D, ptr %P
%E = shl i64 %C, 32
- %F = ashr i64 %E, 32
+ %F = ashr i64 %E, 32
store volatile i64 %F, ptr%P2
store volatile i32 %D, ptr %P
ret i64 undef
diff --git a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
index 6ebaeee3669713..ae8d450d1345b9 100644
--- a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
+++ b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
@@ -1,8 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; Formerly there were two shifts.
define i64 @baz(i32 %A) nounwind {
-; CHECK: shlq $49, %r
+; CHECK-LABEL: baz:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shlq $49, %rax
+; CHECK-NEXT: retq
%tmp1 = shl i32 %A, 17
%tmp2 = zext i32 %tmp1 to i64
%tmp3 = shl i64 %tmp2, 32
From 2963c08192399286c138cb7e4d645e43e0476f02 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Mon, 6 May 2024 12:01:59 -0500
Subject: [PATCH 2/2] [DAGCombiner] Set shift flags during visit.
This is basically a direct port of what we already have in InstCombine.
The goal is to avoid re-implementing these checks every time one of the
flags needs to be tested.
Leaving this as a draft, since the test diffs are pretty noisy.
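
For context, the InstCombine logic being ported boils down to three
known-bits checks: a shl is nuw when no set bits are shifted out the top,
nsw when enough sign bits remain that the sign is unchanged, and a
lshr/ashr is exact when no set bits are shifted out the bottom. Below is a
minimal sketch of how that could look against the SelectionDAG APIs,
assuming a constant shift amount; the helper name inferShiftFlags and its
structure are illustrative, not taken from the patch itself:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Illustrative helper: infer nuw/nsw/exact on a shift node from known bits,
// mirroring the InstCombine-style conditions described above.
static void inferShiftFlags(SDNode *N, SelectionDAG &DAG) {
  // Keep the sketch simple: only handle a constant shift amount.
  auto *Amt = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!Amt)
    return;
  SDValue Op0 = N->getOperand(0);
  unsigned BitWidth = Op0.getScalarValueSizeInBits();
  uint64_t ShAmt = Amt->getZExtValue();
  if (ShAmt >= BitWidth) // Out-of-range shifts are undefined; nothing to infer.
    return;

  SDNodeFlags Flags = N->getFlags();
  switch (N->getOpcode()) {
  case ISD::SHL: {
    KnownBits Known = DAG.computeKnownBits(Op0);
    // nuw: no set bits are shifted out the top.
    if (Known.countMinLeadingZeros() >= ShAmt)
      Flags.setNoUnsignedWrap(true);
    // nsw: enough duplicated sign bits that the sign bit is preserved.
    if (DAG.ComputeNumSignBits(Op0) > ShAmt)
      Flags.setNoSignedWrap(true);
    break;
  }
  case ISD::SRL:
  case ISD::SRA: {
    KnownBits Known = DAG.computeKnownBits(Op0);
    // exact: no set bits are shifted out the bottom.
    if (Known.countMinTrailingZeros() >= ShAmt)
      Flags.setExact(true);
    break;
  }
  default:
    return;
  }
  N->setFlags(Flags);
}

Computing the flags once at visit time means later combines can simply
query N->getFlags() instead of each redoing its own known-bits analysis.
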
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 70 +-
.../test/CodeGen/AArch64/DAGCombine_vscale.ll | 4 +-
.../AArch64/aarch64-address-type-promotion.ll | 12 +-
llvm/test/CodeGen/AArch64/addsub.ll | 2 +-
.../CodeGen/AArch64/arm64-narrow-st-merge.ll | 173 +-
llvm/test/CodeGen/AArch64/arm64-rev.ll | 45 +-
.../test/CodeGen/AArch64/arm64-trunc-store.ll | 8 +-
llvm/test/CodeGen/AArch64/bswap-known-bits.ll | 2 +-
.../AArch64/const-shift-of-constmasked.ll | 82 +-
.../CodeGen/AArch64/extract-subvec-combine.ll | 10 +-
.../fold-int-pow2-with-fmul-or-fdiv.ll | 3 +-
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 2 +-
.../AArch64/pull-binop-through-shift.ll | 20 +-
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 28 +-
llvm/test/CodeGen/AArch64/select_const.ll | 21 +-
llvm/test/CodeGen/AArch64/shift-logic.ll | 10 +-
llvm/test/CodeGen/AArch64/shift-mod.ll | 10 +-
.../AArch64/signed-truncation-check.ll | 2 +-
llvm/test/CodeGen/AArch64/srem-pow2.ll | 22 +-
.../AArch64/srem-seteq-illegal-types.ll | 14 +-
.../CodeGen/AArch64/srem-seteq-vec-splat.ll | 15 +-
llvm/test/CodeGen/AArch64/sshl_sat.ll | 19 +-
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 28 +-
.../test/CodeGen/AArch64/storepairsuppress.ll | 16 +-
llvm/test/CodeGen/AArch64/tbl-loops.ll | 10 +-
llvm/test/CodeGen/AArch64/ushl_sat.ll | 11 +-
...vector_splat-const-shift-of-constmasked.ll | 25 +-
llvm/test/CodeGen/AArch64/win64_vararg.ll | 2 +-
llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 9 +-
llvm/test/CodeGen/AMDGPU/anyext.ll | 2 +
llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 91 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 26290 +++++++++-------
llvm/test/CodeGen/AMDGPU/bfi_nested.ll | 3 +-
llvm/test/CodeGen/AMDGPU/bswap.ll | 1 +
.../build-vector-packed-partial-undef.ll | 4 +
llvm/test/CodeGen/AMDGPU/build_vector.ll | 29 +-
llvm/test/CodeGen/AMDGPU/bypass-div.ll | 904 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 3 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 1085 +-
.../CodeGen/AMDGPU/combine-vload-extract.ll | 14 +-
llvm/test/CodeGen/AMDGPU/commute-shifts.ll | 4 +
.../CodeGen/AMDGPU/computeNumSignBits-mul.ll | 3 +-
llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 76 +-
llvm/test/CodeGen/AMDGPU/ctpop16.ll | 304 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 117 +-
.../AMDGPU/divergence-driven-buildvector.ll | 9 +-
.../AMDGPU/divergence-driven-sext-inreg.ll | 4 +-
llvm/test/CodeGen/AMDGPU/ds_read2.ll | 17 +-
llvm/test/CodeGen/AMDGPU/extract-load-i1.ll | 28 +-
.../CodeGen/AMDGPU/extract-subvector-16bit.ll | 424 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 10 +-
.../fast-unaligned-load-store.private.ll | 8 +
.../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 4 +
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 72 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 15 +-
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 56 +-
llvm/test/CodeGen/AMDGPU/fneg.ll | 8 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 36 +-
llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 310 +-
llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 310 +-
.../AMDGPU/fp_trunc_store_fp64_to_bf16.ll | 642 +-
llvm/test/CodeGen/AMDGPU/fshr.ll | 4 +-
llvm/test/CodeGen/AMDGPU/function-returns.ll | 217 +-
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 364 +-
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 142 +-
llvm/test/CodeGen/AMDGPU/idot4s.ll | 50 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 26 +-
llvm/test/CodeGen/AMDGPU/idot8s.ll | 1253 +-
llvm/test/CodeGen/AMDGPU/idot8u.ll | 106 +-
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 45 +-
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 57 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 345 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 634 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 596 +-
.../AMDGPU/kernel-argument-dag-lowering.ll | 6 +-
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 58 +-
.../CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll | 3 +-
.../AMDGPU/llvm.amdgcn.raw.buffer.load.ll | 32 -
.../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 32 -
.../AMDGPU/llvm.amdgcn.struct.buffer.load.ll | 56 +-
.../llvm.amdgcn.struct.ptr.buffer.load.ll | 56 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 12 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 17 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 17 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 5 +-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 17 +-
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 17 +-
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 5 +-
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 324 +-
.../AMDGPU/llvm.r600.read.local.size.ll | 54 +-
llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 146 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 5496 ++--
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 2332 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 2716 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 4613 +--
llvm/test/CodeGen/AMDGPU/load-lo16.ll | 63 +-
.../AMDGPU/load-range-metadata-sign-bits.ll | 23 +-
llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 304 +-
llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll | 11 +-
llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 18 +-
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 136 +-
llvm/test/CodeGen/AMDGPU/max.ll | 12 +-
llvm/test/CodeGen/AMDGPU/memory_clause.ll | 8 +-
llvm/test/CodeGen/AMDGPU/min.ll | 89 +-
llvm/test/CodeGen/AMDGPU/mul_int24.ll | 24 +-
llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 241 +-
llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll | 77 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 229 +-
.../AMDGPU/reassoc-mul-add-1-to-mad.ll | 111 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 111 +-
llvm/test/CodeGen/AMDGPU/saddsat.ll | 9 +-
llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 48 +-
llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 510 +-
llvm/test/CodeGen/AMDGPU/sdiv.ll | 12 +-
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 757 +-
.../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 353 +-
llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 825 +-
llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll | 5 +-
.../test/CodeGen/AMDGPU/shl-add-to-add-shl.ll | 4 +-
llvm/test/CodeGen/AMDGPU/shl.ll | 64 +-
llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 8 +-
llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll | 12 -
.../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 134 +-
llvm/test/CodeGen/AMDGPU/sra.ll | 72 +-
.../AMDGPU/srem-seteq-illegal-types.ll | 12 +-
llvm/test/CodeGen/AMDGPU/srem64.ll | 847 +-
llvm/test/CodeGen/AMDGPU/ssubsat.ll | 9 +-
llvm/test/CodeGen/AMDGPU/store-private.ll | 1009 +-
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 3 +-
llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 3 +-
llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 2 +-
llvm/test/CodeGen/AMDGPU/uaddsat.ll | 1 +
llvm/test/CodeGen/AMDGPU/udiv.ll | 387 +-
llvm/test/CodeGen/AMDGPU/udiv64.ll | 798 +-
.../AMDGPU/unstructured-cfg-def-use-issue.ll | 156 +-
llvm/test/CodeGen/AMDGPU/urem64.ll | 505 +-
llvm/test/CodeGen/AMDGPU/usubsat.ll | 5 +-
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 431 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 336 +-
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 25 +-
llvm/test/CodeGen/ARM/and-cmpz.ll | 8 +-
llvm/test/CodeGen/ARM/and-load-combine.ll | 12 +-
llvm/test/CodeGen/ARM/bfi-chain-cse-crash.ll | 16 +-
llvm/test/CodeGen/ARM/bfi.ll | 2 +
llvm/test/CodeGen/ARM/bfx.ll | 3 +-
llvm/test/CodeGen/ARM/combine-movc-sub.ll | 6 +-
llvm/test/CodeGen/ARM/demanded-bits-and.ll | 1 +
...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 45 +-
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 49 +-
.../CodeGen/ARM/illegal-bitfield-loadstore.ll | 51 +-
llvm/test/CodeGen/ARM/pr36577.ll | 11 +-
llvm/test/CodeGen/ARM/sadd_sat_plus.ll | 2 +
llvm/test/CodeGen/ARM/sbfx.ll | 2 +-
llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll | 4 +-
llvm/test/CodeGen/ARM/shift-combine.ll | 43 +-
.../CodeGen/ARM/simplifysetcc_narrow_load.ll | 6 +-
.../CodeGen/ARM/srem-seteq-illegal-types.ll | 94 +-
llvm/test/CodeGen/ARM/ssub_sat_plus.ll | 2 +
.../Hexagon/atomicrmw-uinc-udec-wrap.ll | 108 +-
.../Hexagon/isel-global-offset-alignment.ll | 6 +-
llvm/test/CodeGen/Hexagon/vect/vect-shifts.ll | 167 +-
.../LoongArch/atomicrmw-uinc-udec-wrap.ll | 36 +-
llvm/test/CodeGen/LoongArch/bstrins_w.ll | 1 +
llvm/test/CodeGen/LoongArch/bstrpick_d.ll | 6 +-
llvm/test/CodeGen/LoongArch/bstrpick_w.ll | 6 +-
llvm/test/CodeGen/LoongArch/bytepick.ll | 35 +-
.../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 19 +-
llvm/test/CodeGen/LoongArch/fcopysign.ll | 3 +
.../CodeGen/LoongArch/ir-instruction/and.ll | 5 +-
.../ir-instruction/atomic-cmpxchg.ll | 42 +-
.../ir-instruction/atomicrmw-minmax.ll | 460 +-
.../LoongArch/ir-instruction/atomicrmw.ll | 540 +-
.../CodeGen/LoongArch/ir-instruction/lshr.ll | 12 +-
llvm/test/CodeGen/LoongArch/legalicmpimm.ll | 5 +-
llvm/test/CodeGen/LoongArch/rotl-rotr.ll | 9 +-
llvm/test/CodeGen/LoongArch/sextw-removal.ll | 74 +-
llvm/test/CodeGen/Mips/atomic.ll | 28 +-
.../CodeGen/Mips/cconv/illegal-vectors.ll | 420 +-
llvm/test/CodeGen/Mips/cins.ll | 3 +-
llvm/test/CodeGen/Mips/dins.ll | 28 +-
llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll | 27 +-
llvm/test/CodeGen/Mips/fcopysign.ll | 46 +-
llvm/test/CodeGen/Mips/funnel-shift-rot.ll | 8 +-
llvm/test/CodeGen/Mips/funnel-shift.ll | 8 +-
llvm/test/CodeGen/Mips/llvm-ir/abs.ll | 118 +-
llvm/test/CodeGen/Mips/llvm-ir/nan-fp-attr.ll | 31 +-
.../CodeGen/Mips/load-store-left-right.ll | 78 +-
llvm/test/CodeGen/Mips/mips64-f128.ll | 17 +-
.../CodeGen/Mips/srem-seteq-illegal-types.ll | 32 +-
llvm/test/CodeGen/Mips/unalignedload.ll | 12 +-
llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 8 +-
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 21 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 74 +-
llvm/test/CodeGen/PowerPC/coalesce-ext.ll | 3 +-
.../CodeGen/PowerPC/more-dq-form-prepare.ll | 281 +-
.../no-ctr-loop-if-exit-in-nested-loop.ll | 29 +-
llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll | 38 +-
llvm/test/CodeGen/PowerPC/pr38087.ll | 5 +-
llvm/test/CodeGen/PowerPC/pr45432.ll | 4 +-
llvm/test/CodeGen/PowerPC/rlwinm.ll | 3 +-
.../test/CodeGen/PowerPC/sext-vector-inreg.ll | 5 +-
llvm/test/CodeGen/PowerPC/sms-phi-3.ll | 42 +-
.../PowerPC/srem-seteq-illegal-types.ll | 4 +-
llvm/test/CodeGen/PowerPC/vec-itofp.ll | 38 +-
.../PowerPC/vec_conv_i16_to_fp32_elts.ll | 49 +-
.../PowerPC/vec_conv_i16_to_fp64_elts.ll | 95 +-
.../PowerPC/vec_conv_i8_to_fp32_elts.ll | 27 +-
.../PowerPC/vec_conv_i8_to_fp64_elts.ll | 89 +-
.../vector-constrained-fp-intrinsics.ll | 11 +-
llvm/test/CodeGen/RISCV/add-before-shl.ll | 36 +-
.../RISCV/atomic-cmpxchg-branch-on-result.ll | 24 +-
llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll | 128 +
llvm/test/CodeGen/RISCV/atomic-rmw.ll | 2786 +-
llvm/test/CodeGen/RISCV/atomic-signext.ll | 228 +-
.../CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll | 56 +-
llvm/test/CodeGen/RISCV/bittest.ll | 14 +-
llvm/test/CodeGen/RISCV/bswap-bitreverse.ll | 48 +-
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 12 +-
llvm/test/CodeGen/RISCV/div-by-constant.ll | 115 +-
llvm/test/CodeGen/RISCV/div.ll | 24 +-
llvm/test/CodeGen/RISCV/float-intrinsics.ll | 12 +-
llvm/test/CodeGen/RISCV/pr65025.ll | 3 +-
llvm/test/CodeGen/RISCV/rem.ll | 28 +-
.../CodeGen/RISCV/riscv-shifted-extend.ll | 30 +-
llvm/test/CodeGen/RISCV/rotl-rotr.ll | 6 +-
llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll | 9 +-
llvm/test/CodeGen/RISCV/rv64-legal-i32/rem.ll | 14 +-
.../CodeGen/RISCV/rv64-legal-i32/rv64zba.ll | 49 +-
llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll | 27 +-
.../test/CodeGen/RISCV/rv64i-tricky-shifts.ll | 6 +-
llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 23 +-
llvm/test/CodeGen/RISCV/rv64zba.ll | 51 +-
llvm/test/CodeGen/RISCV/rv64zbb.ll | 27 +-
.../CodeGen/RISCV/rvv/extract-subvector.ll | 9 +-
.../CodeGen/RISCV/rvv/extractelt-int-rv32.ll | 78 +
.../CodeGen/RISCV/rvv/extractelt-int-rv64.ll | 78 +
.../RISCV/rvv/fixed-vectors-masked-gather.ll | 40 +-
.../rvv/fixed-vectors-reduction-int-vp.ll | 872 +-
.../CodeGen/RISCV/rvv/fixed-vectors-sad.ll | 26 -
.../CodeGen/RISCV/rvv/fixed-vectors-store.ll | 79 +-
.../RISCV/rvv/fixed-vectors-unaligned.ll | 40 +-
.../CodeGen/RISCV/rvv/insert-subvector.ll | 18 +-
.../CodeGen/RISCV/rvv/legalize-load-sdnode.ll | 7 +-
.../RISCV/rvv/legalize-store-sdnode.ll | 7 +-
llvm/test/CodeGen/RISCV/rvv/rvv-vscale.i64.ll | 15 +-
.../CodeGen/RISCV/rvv/vec3-setcc-crash.ll | 38 +-
.../CodeGen/RISCV/rvv/vreductions-int-vp.ll | 1289 +-
.../test/CodeGen/RISCV/rvv/vreductions-int.ll | 1338 +-
llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 22 +-
llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll | 48 +-
llvm/test/CodeGen/RISCV/shifts.ll | 120 +-
.../CodeGen/RISCV/split-udiv-by-constant.ll | 110 +-
.../CodeGen/RISCV/split-urem-by-constant.ll | 82 +-
llvm/test/CodeGen/RISCV/srem-lkk.ll | 28 +-
.../CodeGen/RISCV/srem-seteq-illegal-types.ll | 226 +-
llvm/test/CodeGen/RISCV/srem-vector-lkk.ll | 56 +-
.../CodeGen/RISCV/unaligned-load-store.ll | 78 +-
.../CodeGen/RISCV/urem-seteq-illegal-types.ll | 6 +-
...lar-shift-by-byte-multiple-legalization.ll | 2254 +-
.../RISCV/wide-scalar-shift-legalization.ll | 3103 +-
llvm/test/CodeGen/SystemZ/int-abs-01.ll | 24 +-
llvm/test/CodeGen/SystemZ/int-cmp-44.ll | 5 +-
llvm/test/CodeGen/SystemZ/int-neg-02.ll | 56 +-
.../SystemZ/store_nonbytesized_vecs.ll | 31 +-
llvm/test/CodeGen/Thumb/shift-and.ll | 25 +-
.../CodeGen/Thumb/srem-seteq-illegal-types.ll | 19 +-
llvm/test/CodeGen/Thumb/umul_fix_sat.ll | 8 +-
.../tail-pred-disabled-in-loloops.ll | 5 +-
llvm/test/CodeGen/Thumb2/bfx.ll | 3 +-
.../CodeGen/Thumb2/mve-float16regloops.ll | 4 +-
.../CodeGen/Thumb2/mve-float32regloops.ll | 6 +-
.../CodeGen/Thumb2/mve-gather-increment.ll | 294 +-
.../CodeGen/Thumb2/mve-gather-tailpred.ll | 22 +-
.../CodeGen/Thumb2/mve-scatter-increment.ll | 28 +-
llvm/test/CodeGen/Thumb2/shift_parts.ll | 14 +-
.../Thumb2/srem-seteq-illegal-types.ll | 15 +-
llvm/test/CodeGen/VE/Scalar/bitreverse.ll | 12 +-
llvm/test/CodeGen/WebAssembly/pr47375.ll | 8 +-
llvm/test/CodeGen/WebAssembly/simd-bitmask.ll | 26 +-
llvm/test/CodeGen/WebAssembly/simd-pr61780.ll | 4 +-
.../WebAssembly/simd-vecreduce-bool.ll | 638 +-
.../CodeGen/X86/2008-05-12-tailmerge-5.ll | 21 +-
.../X86/2009-05-23-dagcombine-shifts.ll | 6 +-
llvm/test/CodeGen/X86/add-ext.ll | 3 +-
llvm/test/CodeGen/X86/addr-mode-matcher-2.ll | 8 +-
llvm/test/CodeGen/X86/addr-mode-matcher-3.ll | 9 +-
llvm/test/CodeGen/X86/atomic-bit-test.ll | 35 +-
.../test/CodeGen/X86/atomic-rm-bit-test-64.ll | 15 +-
llvm/test/CodeGen/X86/atomic-rm-bit-test.ll | 72 +-
llvm/test/CodeGen/X86/avx512-calling-conv.ll | 426 +-
llvm/test/CodeGen/X86/bfloat.ll | 112 +-
llvm/test/CodeGen/X86/bitreverse.ll | 79 +-
llvm/test/CodeGen/X86/bool-math.ll | 8 +-
llvm/test/CodeGen/X86/bool-vector.ll | 12 -
llvm/test/CodeGen/X86/btc_bts_btr.ll | 32 +-
llvm/test/CodeGen/X86/buildvec-insertvec.ll | 7 +
llvm/test/CodeGen/X86/combine-rotates.ll | 3 +-
llvm/test/CodeGen/X86/combine-sext-in-reg.ll | 41 +-
llvm/test/CodeGen/X86/combine-shl.ll | 147 +-
llvm/test/CodeGen/X86/combine-srem.ll | 91 +-
llvm/test/CodeGen/X86/combine-srl.ll | 59 +-
.../CodeGen/X86/const-shift-of-constmasked.ll | 252 +-
llvm/test/CodeGen/X86/ctpop-mask.ll | 37 +-
llvm/test/CodeGen/X86/dagcombine-select.ll | 63 +-
llvm/test/CodeGen/X86/dagcombine-shifts.ll | 91 +-
llvm/test/CodeGen/X86/divide-by-constant.ll | 60 +-
llvm/test/CodeGen/X86/divmod128.ll | 96 +-
llvm/test/CodeGen/X86/extract-bits.ll | 247 +-
.../CodeGen/X86/field-extract-use-trunc.ll | 12 +-
llvm/test/CodeGen/X86/flt-rounds.ll | 40 +-
llvm/test/CodeGen/X86/fold-and-shift.ll | 3 +-
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 6 +-
...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 2 +
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 2 +
.../CodeGen/X86/illegal-bitfield-loadstore.ll | 11 +-
.../CodeGen/X86/insertelement-var-index.ll | 15 +-
llvm/test/CodeGen/X86/int-to-fp-demanded.ll | 3 +
llvm/test/CodeGen/X86/is_fpclass.ll | 8 +-
llvm/test/CodeGen/X86/known-pow2.ll | 2 +-
llvm/test/CodeGen/X86/lea-dagdag.ll | 8 +-
llvm/test/CodeGen/X86/lea.ll | 6 +-
llvm/test/CodeGen/X86/load-local-v3i129.ll | 51 +-
llvm/test/CodeGen/X86/load-local-v4i5.ll | 19 +-
llvm/test/CodeGen/X86/lvi-hardening-loads.ll | 3 +-
llvm/test/CodeGen/X86/movmsk-cmp.ll | 16 +-
llvm/test/CodeGen/X86/movmsk.ll | 16 +-
llvm/test/CodeGen/X86/packus.ll | 76 +-
llvm/test/CodeGen/X86/parity.ll | 12 +-
llvm/test/CodeGen/X86/pr22970.ll | 3 +-
llvm/test/CodeGen/X86/pr29170.ll | 4 +-
llvm/test/CodeGen/X86/pr32420.ll | 6 +-
llvm/test/CodeGen/X86/pr32588.ll | 18 +-
llvm/test/CodeGen/X86/pr45995.ll | 27 +-
llvm/test/CodeGen/X86/pr61923.ll | 3 +-
llvm/test/CodeGen/X86/pr77459.ll | 8 +-
.../CodeGen/X86/pull-binop-through-shift.ll | 38 +-
llvm/test/CodeGen/X86/rem.ll | 3 +-
llvm/test/CodeGen/X86/sadd_sat_vec.ll | 91 +-
llvm/test/CodeGen/X86/sdiv_fix.ll | 69 +-
llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 35 +-
llvm/test/CodeGen/X86/select.ll | 20 +-
llvm/test/CodeGen/X86/select_const.ll | 4 +-
llvm/test/CodeGen/X86/selectcc-to-shiftand.ll | 13 +-
llvm/test/CodeGen/X86/setcc.ll | 39 +-
llvm/test/CodeGen/X86/sext-subreg.ll | 6 +-
llvm/test/CodeGen/X86/shift-and.ll | 4 +-
llvm/test/CodeGen/X86/shift-combine.ll | 132 +-
llvm/test/CodeGen/X86/shift-folding.ll | 6 +-
llvm/test/CodeGen/X86/shift-i128.ll | 37 +-
llvm/test/CodeGen/X86/shift-mask.ll | 310 +-
llvm/test/CodeGen/X86/shift-pair.ll | 5 +-
llvm/test/CodeGen/X86/shrink-compare-pgso.ll | 2 +
llvm/test/CodeGen/X86/shrink-compare.ll | 2 +
llvm/test/CodeGen/X86/smul_fix_sat.ll | 4 +
llvm/test/CodeGen/X86/split-store.ll | 2 +-
llvm/test/CodeGen/X86/srem-lkk.ll | 6 +-
.../CodeGen/X86/srem-seteq-illegal-types.ll | 10 +-
llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll | 102 +-
llvm/test/CodeGen/X86/srem-seteq.ll | 12 +-
llvm/test/CodeGen/X86/srem-vector-lkk.ll | 124 +-
llvm/test/CodeGen/X86/sse2-vector-shifts.ll | 9 +-
llvm/test/CodeGen/X86/sshl_sat.ll | 32 +-
llvm/test/CodeGen/X86/ssub_sat_vec.ll | 89 +-
llvm/test/CodeGen/X86/sttni.ll | 28 +-
.../subvectorwise-store-of-vector-splat.ll | 484 +-
llvm/test/CodeGen/X86/test-vs-bittest.ll | 2 +-
llvm/test/CodeGen/X86/udiv_fix.ll | 34 +-
llvm/test/CodeGen/X86/udiv_fix_sat.ll | 62 +-
llvm/test/CodeGen/X86/umul-with-overflow.ll | 7 +-
llvm/test/CodeGen/X86/umul_fix_sat.ll | 2 +
.../CodeGen/X86/urem-seteq-illegal-types.ll | 6 +-
llvm/test/CodeGen/X86/ushl_sat.ll | 10 +-
llvm/test/CodeGen/X86/vec_shift5.ll | 2 +-
llvm/test/CodeGen/X86/vector-bitreverse.ll | 34 +-
llvm/test/CodeGen/X86/vector-sext.ll | 2 +
.../vector-shuffle-combining-avx512bwvl.ll | 4 +-
...vector_splat-const-shift-of-constmasked.ll | 210 +-
llvm/test/CodeGen/X86/vselect.ll | 12 +-
llvm/test/CodeGen/X86/x86-64-extend-shift.ll | 3 +-
llvm/test/CodeGen/X86/zext-shl.ll | 6 +-
381 files changed, 48813 insertions(+), 34569 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 05ab6e2e48206f..a182fbdde4f17b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9738,6 +9738,74 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
return SDValue();
}
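+// Try to infer poison-generating flags for a shift node: nuw/nsw for SHL and
+// exact for SRA/SRL. Uses known bits of the operands and returns the rebuilt
+// node if any new flag could be set, or an empty SDValue otherwise.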
+static SDValue setShiftFlags(SelectionDAG &DAG, const SDLoc &DL, SDNode *N) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::SHL || Opc == ISD::SRA || Opc == ISD::SRL) &&
+ "Unknown shift opcode");
+ SDNodeFlags Flags = N->getFlags();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ // Check if we already have the flags.
+ if (Opc == ISD::SHL) {
+ if (Flags.hasNoSignedWrap() && Flags.hasNoUnsignedWrap())
+ return SDValue();
+
+ } else {
+ if (Flags.hasExact())
+ return SDValue();
+
+ // shr (shl X, Y), Y
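+    // (shl X, Y) has Y known-zero low bits, so shifting them back out
+    // discards only zeros and the shift is exact.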
+ if (sd_match(N0, m_Shl(m_Value(), m_Specific(N1)))) {
+ Flags.setExact(true);
+ return DAG.getNode(Opc, DL, VT, N0, N1, Flags);
+ }
+ }
+
+  // Compute what we know about the shift count (N1).
+  KnownBits KnownCnt = DAG.computeKnownBits(N1);
+  // Compute what we know about the value being shifted (N0).
+ KnownBits KnownAmt = DAG.computeKnownBits(N0);
+ APInt MaxCnt = KnownCnt.getMaxValue();
+ bool Changed = false;
+ if (Opc == ISD::SHL) {
+    // If we have at least as many leading zeros as the maximum shift count,
+    // we have nuw.
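+    // E.g. an i32 known to fit in 8 bits has 24 leading zeros, so any shift
+    // count up to 24 cannot move a set bit past the top.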
+ if (!Flags.hasNoUnsignedWrap() &&
+ MaxCnt.ule(KnownAmt.countMinLeadingZeros())) {
+ Flags.setNoUnsignedWrap(true);
+ Changed = true;
+ }
+    // If we have more sign bits than the maximum shift count, we have nsw.
+ if (!Flags.hasNoSignedWrap()) {
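+      // A count strictly below the number of known sign bits leaves the sign
+      // bit in place, so the signed value cannot wrap.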
+ if (MaxCnt.ult(KnownAmt.countMinSignBits()) ||
+ MaxCnt.ult(DAG.ComputeNumSignBits(N0))) {
+ Flags.setNoSignedWrap(true);
+ Changed = true;
+ }
+ }
+ } else {
+    // If we have at least as many trailing zeros as the maximum shift count,
+    // the shift is exact.
+ Changed = MaxCnt.ule(KnownAmt.countMinTrailingZeros());
+ Flags.setExact(Changed);
+ }
+
+ if (Changed)
+ return DAG.getNode(Opc, DL, VT, N0, N1, Flags);
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSHL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -9745,6 +9803,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
return V;
SDLoc DL(N);
+ if (SDValue V = setShiftFlags(DAG, DL, N))
+ return V;
+
EVT VT = N0.getValueType();
EVT ShiftVT = N1.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -10188,6 +10249,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
return V;
SDLoc DL(N);
+ if (SDValue V = setShiftFlags(DAG, DL, N))
+ return V;
+
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -10389,6 +10453,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return V;
SDLoc DL(N);
+ if (SDValue V = setShiftFlags(DAG, DL, N))
+ return V;
EVT VT = N0.getValueType();
EVT ShiftVT = N1.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
diff --git a/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll b/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll
index 71f4da2b465c13..26f41f4d98c5cc 100644
--- a/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll
+++ b/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll
@@ -7,8 +7,8 @@
define <vscale x 4 x i32> @sext_inreg(<vscale x 4 x i32> %a) {
; CHECK-LABEL: sext_inreg:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: asr z0.s, z0.s, #16
; CHECK-NEXT: ret
%in = insertelement <vscale x 4 x i32> undef, i32 16, i32 0
%splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll b/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
index e14618251b6d7d..da29a480959394 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
@@ -8,10 +8,14 @@ target triple = "arm64-apple-macosx10.9"
define void @foo(ptr nocapture %a, i32 %i) {
; CHECK-LABEL: foo:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-NEXT: ldp w9, w10, [x8, #4]
-; CHECK-NEXT: add w9, w10, w9
-; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: add x9, x8, #1
+; CHECK-NEXT: add x8, x8, #2
+; CHECK-NEXT: ldr w9, [x0, x9, lsl #2]
+; CHECK-NEXT: ldr w8, [x0, x8, lsl #2]
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: str w8, [x0, w1, sxtw #2]
; CHECK-NEXT: ret
entry:
%add = add nsw i32 %i, 1
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index 20215fe9146924..fed1747c23e1c9 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -376,7 +376,7 @@ define i1 @uadd_add(i8 %a, i8 %b, ptr %p) {
; CHECK-NEXT: mov w8, #255 // =0xff
; CHECK-NEXT: bic w8, w8, w0
; CHECK-NEXT: add w8, w8, w1, uxtb
-; CHECK-NEXT: lsr w0, w8, #8
+; CHECK-NEXT: ubfx w0, w8, #8, #1
; CHECK-NEXT: add w8, w8, #1
; CHECK-NEXT: strb w8, [x2]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll b/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
index 01ad14b6fba52a..033ac301d7abe1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
@@ -11,15 +11,19 @@ define void @Strh_zero(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Strh_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x8, x1, #1, #32
-; CHECK-NEXT: str wzr, [x0, x8]
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: strh wzr, [x0, w1, sxtw #1]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Strh_zero:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
-; CHECK-STRICT-NEXT: strh wzr, [x8]
-; CHECK-STRICT-NEXT: strh wzr, [x8, #2]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: strh wzr, [x0, w1, sxtw #1]
+; CHECK-STRICT-NEXT: add x8, x8, #1
+; CHECK-STRICT-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
@@ -43,17 +47,27 @@ define void @Strh_zero_4(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Strh_zero_4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x8, x1, #1, #32
-; CHECK-NEXT: str xzr, [x0, x8]
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: strh wzr, [x0, w1, sxtw #1]
+; CHECK-NEXT: add x9, x8, #1
+; CHECK-NEXT: add x10, x8, #2
+; CHECK-NEXT: add x8, x8, #3
+; CHECK-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-NEXT: strh wzr, [x0, x10, lsl #1]
+; CHECK-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Strh_zero_4:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
-; CHECK-STRICT-NEXT: strh wzr, [x8]
-; CHECK-STRICT-NEXT: strh wzr, [x8, #2]
-; CHECK-STRICT-NEXT: strh wzr, [x8, #4]
-; CHECK-STRICT-NEXT: strh wzr, [x8, #6]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: strh wzr, [x0, w1, sxtw #1]
+; CHECK-STRICT-NEXT: add x9, x8, #1
+; CHECK-STRICT-NEXT: add x10, x8, #2
+; CHECK-STRICT-NEXT: add x8, x8, #3
+; CHECK-STRICT-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-STRICT-NEXT: strh wzr, [x0, x10, lsl #1]
+; CHECK-STRICT-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
@@ -82,14 +96,19 @@ define void @Strw_zero(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Strw_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x8, x1, #2, #32
-; CHECK-NEXT: str xzr, [x0, x8]
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: str wzr, [x0, w1, sxtw #2]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Strw_zero:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-STRICT-NEXT: stp wzr, wzr, [x8]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: str wzr, [x0, w1, sxtw #2]
+; CHECK-STRICT-NEXT: add x8, x8, #1
+; CHECK-STRICT-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
@@ -107,14 +126,20 @@ entry:
define void @Strw_zero_nonzero(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Strw_zero_nonzero:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-NEXT: stp wzr, w1, [x8]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: str wzr, [x0, w1, sxtw #2]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: str w1, [x0, x8, lsl #2]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Strw_zero_nonzero:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-STRICT-NEXT: stp wzr, w1, [x8]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: str wzr, [x0, w1, sxtw #2]
+; CHECK-STRICT-NEXT: add x8, x8, #1
+; CHECK-STRICT-NEXT: str w1, [x0, x8, lsl #2]
; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
@@ -135,15 +160,28 @@ entry:
define void @Strw_zero_4(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Strw_zero_4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-NEXT: stp xzr, xzr, [x8]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: str wzr, [x0, w1, sxtw #2]
+; CHECK-NEXT: add x9, x8, #1
+; CHECK-NEXT: add x10, x8, #2
+; CHECK-NEXT: add x8, x8, #3
+; CHECK-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-NEXT: str wzr, [x0, x10, lsl #2]
+; CHECK-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Strw_zero_4:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-STRICT-NEXT: stp wzr, wzr, [x8]
-; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #8]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: str wzr, [x0, w1, sxtw #2]
+; CHECK-STRICT-NEXT: add x9, x8, #1
+; CHECK-STRICT-NEXT: add x10, x8, #2
+; CHECK-STRICT-NEXT: add x8, x8, #3
+; CHECK-STRICT-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-STRICT-NEXT: str wzr, [x0, x10, lsl #2]
+; CHECK-STRICT-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
@@ -202,15 +240,22 @@ entry:
define void @Sturh_zero(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Sturh_zero:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #1
-; CHECK-NEXT: stur wzr, [x8, #-6]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: sub x9, x8, #2
+; CHECK-NEXT: sub x8, x8, #3
+; CHECK-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Sturh_zero:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
-; CHECK-STRICT-NEXT: sturh wzr, [x8, #-4]
-; CHECK-STRICT-NEXT: sturh wzr, [x8, #-6]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: sub x9, x8, #2
+; CHECK-STRICT-NEXT: sub x8, x8, #3
+; CHECK-STRICT-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-STRICT-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -2
@@ -234,17 +279,30 @@ entry:
define void @Sturh_zero_4(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Sturh_zero_4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #1
-; CHECK-NEXT: stur xzr, [x8, #-8]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: sub x9, x8, #3
+; CHECK-NEXT: sub x10, x8, #4
+; CHECK-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-NEXT: sub x9, x8, #2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: strh wzr, [x0, x10, lsl #1]
+; CHECK-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Sturh_zero_4:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
-; CHECK-STRICT-NEXT: sturh wzr, [x8, #-6]
-; CHECK-STRICT-NEXT: sturh wzr, [x8, #-8]
-; CHECK-STRICT-NEXT: sturh wzr, [x8, #-4]
-; CHECK-STRICT-NEXT: sturh wzr, [x8, #-2]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: sub x9, x8, #3
+; CHECK-STRICT-NEXT: sub x10, x8, #4
+; CHECK-STRICT-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-STRICT-NEXT: sub x9, x8, #2
+; CHECK-STRICT-NEXT: sub x8, x8, #1
+; CHECK-STRICT-NEXT: strh wzr, [x0, x10, lsl #1]
+; CHECK-STRICT-NEXT: strh wzr, [x0, x9, lsl #1]
+; CHECK-STRICT-NEXT: strh wzr, [x0, x8, lsl #1]
; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
@@ -273,14 +331,22 @@ entry:
define void @Sturw_zero(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Sturw_zero:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-NEXT: stur xzr, [x8, #-16]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: sub x9, x8, #3
+; CHECK-NEXT: sub x8, x8, #4
+; CHECK-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Sturw_zero:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-16]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: sub x9, x8, #3
+; CHECK-STRICT-NEXT: sub x8, x8, #4
+; CHECK-STRICT-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-STRICT-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
@@ -302,15 +368,30 @@ entry:
define void @Sturw_zero_4(ptr nocapture %P, i32 %n) {
; CHECK-LABEL: Sturw_zero_4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-NEXT: stp xzr, xzr, [x8, #-16]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: sub x9, x8, #3
+; CHECK-NEXT: sub x10, x8, #4
+; CHECK-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-NEXT: sub x9, x8, #2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: str wzr, [x0, x10, lsl #2]
+; CHECK-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-NEXT: ret
;
; CHECK-STRICT-LABEL: Sturw_zero_4:
; CHECK-STRICT: // %bb.0: // %entry
-; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
-; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-16]
-; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-8]
+; CHECK-STRICT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-STRICT-NEXT: sxtw x8, w1
+; CHECK-STRICT-NEXT: sub x9, x8, #3
+; CHECK-STRICT-NEXT: sub x10, x8, #4
+; CHECK-STRICT-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-STRICT-NEXT: sub x9, x8, #2
+; CHECK-STRICT-NEXT: sub x8, x8, #1
+; CHECK-STRICT-NEXT: str wzr, [x0, x10, lsl #2]
+; CHECK-STRICT-NEXT: str wzr, [x0, x9, lsl #2]
+; CHECK-STRICT-NEXT: str wzr, [x0, x8, lsl #2]
; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index f548a0e01feee6..19c0c8940b92b3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -27,8 +27,8 @@ entry:
define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-SD-LABEL: test_rev_w_srl16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: rev w8, w0
-; CHECK-SD-NEXT: lsr w0, w8, #16
+; CHECK-SD-NEXT: and w8, w0, #0xffff
+; CHECK-SD-NEXT: rev16 w0, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_rev_w_srl16:
@@ -45,12 +45,18 @@ entry:
}
define i32 @test_rev_w_srl16_load(ptr %a) {
-; CHECK-LABEL: test_rev_w_srl16_load:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: lsr w0, w8, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_rev_w_srl16_load:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: rev16 w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_rev_w_srl16_load:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldrh w8, [x0]
+; CHECK-GI-NEXT: rev w8, w8
+; CHECK-GI-NEXT: lsr w0, w8, #16
+; CHECK-GI-NEXT: ret
entry:
%0 = load i16, ptr %a
%1 = zext i16 %0 to i32
@@ -88,9 +94,8 @@ entry:
define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-SD-LABEL: test_rev_x_srl32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: rev x8, x0
-; CHECK-SD-NEXT: lsr x0, x8, #32
+; CHECK-SD-NEXT: mov w8, w0
+; CHECK-SD-NEXT: rev32 x0, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_rev_x_srl32:
@@ -107,12 +112,18 @@ entry:
}
define i64 @test_rev_x_srl32_load(ptr %a) {
-; CHECK-LABEL: test_rev_x_srl32_load:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: rev x8, x8
-; CHECK-NEXT: lsr x0, x8, #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_rev_x_srl32_load:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr w8, [x0]
+; CHECK-SD-NEXT: rev32 x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_rev_x_srl32_load:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: rev x8, x8
+; CHECK-GI-NEXT: lsr x0, x8, #32
+; CHECK-GI-NEXT: ret
entry:
%0 = load i32, ptr %a
%1 = zext i32 %0 to i64
diff --git a/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll b/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
index cd47fff46729f9..31a649ad64f448 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
@@ -20,10 +20,10 @@ define void @fct32(i32 %arg, i64 %var) {
; CHECK-LABEL: fct32:
; CHECK: // %bb.0: // %bb
; CHECK-NEXT: adrp x8, :got:zptr32
+; CHECK-NEXT: sub w9, w0, #1
; CHECK-NEXT: ldr x8, [x8, :got_lo12:zptr32]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: add x8, x8, w0, sxtw #2
-; CHECK-NEXT: stur w1, [x8, #-4]
+; CHECK-NEXT: str w1, [x8, w9, sxtw #2]
; CHECK-NEXT: ret
bb:
%.pre37 = load ptr, ptr @zptr32, align 8
@@ -39,10 +39,10 @@ define void @fct16(i32 %arg, i64 %var) {
; CHECK-LABEL: fct16:
; CHECK: // %bb.0: // %bb
; CHECK-NEXT: adrp x8, :got:zptr16
+; CHECK-NEXT: sub w9, w0, #1
; CHECK-NEXT: ldr x8, [x8, :got_lo12:zptr16]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: add x8, x8, w0, sxtw #1
-; CHECK-NEXT: sturh w1, [x8, #-2]
+; CHECK-NEXT: strh w1, [x8, w9, sxtw #1]
; CHECK-NEXT: ret
bb:
%.pre37 = load ptr, ptr @zptr16, align 8
diff --git a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
index f13ef52f94a414..9ce7ff72c02900 100644
--- a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
+++ b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -88,7 +88,7 @@ define i32 @demand_one_byte2(i32 %x) {
define i64 @demand_one_byte3(i64 %x) {
; CHECK-LABEL: demand_one_byte3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: and x0, x0, #0xff
+; CHECK-NEXT: ubfx x0, x0, #0, #8
; CHECK-NEXT: ret
%b = call i64 @llvm.bswap.i64(i64 %x)
%r = lshr i64 %b, 56
diff --git a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll
index 66a6745cda8f76..d1e468b70fb9a7 100644
--- a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll
@@ -101,7 +101,8 @@ define i8 @test_i8_224_mask_lshr_6(i8 %a0) {
define i8 @test_i8_7_mask_ashr_1(i8 %a0) {
; CHECK-LABEL: test_i8_7_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #1, #2
+; CHECK-NEXT: and w8, w0, #0x6
+; CHECK-NEXT: asr w0, w8, #1
; CHECK-NEXT: ret
%t0 = and i8 %a0, 7
%t1 = ashr i8 %t0, 1
@@ -112,7 +113,7 @@ define i8 @test_i8_28_mask_ashr_1(i8 %a0) {
; CHECK-LABEL: test_i8_28_mask_ashr_1:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x1c
-; CHECK-NEXT: lsr w0, w8, #1
+; CHECK-NEXT: asr w0, w8, #1
; CHECK-NEXT: ret
%t0 = and i8 %a0, 28
%t1 = ashr i8 %t0, 1
@@ -121,7 +122,8 @@ define i8 @test_i8_28_mask_ashr_1(i8 %a0) {
define i8 @test_i8_28_mask_ashr_2(i8 %a0) {
; CHECK-LABEL: test_i8_28_mask_ashr_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #2, #3
+; CHECK-NEXT: and w8, w0, #0x1c
+; CHECK-NEXT: asr w0, w8, #2
; CHECK-NEXT: ret
%t0 = and i8 %a0, 28
%t1 = ashr i8 %t0, 2
@@ -130,7 +132,8 @@ define i8 @test_i8_28_mask_ashr_2(i8 %a0) {
define i8 @test_i8_28_mask_ashr_3(i8 %a0) {
; CHECK-LABEL: test_i8_28_mask_ashr_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #3, #2
+; CHECK-NEXT: and w8, w0, #0x18
+; CHECK-NEXT: asr w0, w8, #3
; CHECK-NEXT: ret
%t0 = and i8 %a0, 28
%t1 = ashr i8 %t0, 3
@@ -139,7 +142,8 @@ define i8 @test_i8_28_mask_ashr_3(i8 %a0) {
define i8 @test_i8_28_mask_ashr_4(i8 %a0) {
; CHECK-LABEL: test_i8_28_mask_ashr_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #4, #1
+; CHECK-NEXT: and w8, w0, #0x10
+; CHECK-NEXT: asr w0, w8, #4
; CHECK-NEXT: ret
%t0 = and i8 %a0, 28
%t1 = ashr i8 %t0, 4
@@ -169,7 +173,8 @@ define i8 @test_i8_224_mask_ashr_4(i8 %a0) {
define i8 @test_i8_224_mask_ashr_5(i8 %a0) {
; CHECK-LABEL: test_i8_224_mask_ashr_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: sbfx w0, w0, #5, #3
+; CHECK-NEXT: and w8, w0, #0xe0
+; CHECK-NEXT: sbfx w0, w8, #5, #3
; CHECK-NEXT: ret
%t0 = and i8 %a0, 224
%t1 = ashr i8 %t0, 5
@@ -210,7 +215,8 @@ define i8 @test_i8_7_mask_shl_4(i8 %a0) {
define i8 @test_i8_7_mask_shl_5(i8 %a0) {
; CHECK-LABEL: test_i8_7_mask_shl_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w0, w0, #5
+; CHECK-NEXT: and w8, w0, #0x7
+; CHECK-NEXT: lsl w0, w8, #5
; CHECK-NEXT: ret
%t0 = and i8 %a0, 7
%t1 = shl i8 %t0, 5
@@ -376,7 +382,8 @@ define i16 @test_i16_65024_mask_lshr_10(i16 %a0) {
define i16 @test_i16_127_mask_ashr_1(i16 %a0) {
; CHECK-LABEL: test_i16_127_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #1, #6
+; CHECK-NEXT: and w8, w0, #0x7e
+; CHECK-NEXT: asr w0, w8, #1
; CHECK-NEXT: ret
%t0 = and i16 %a0, 127
%t1 = ashr i16 %t0, 1
@@ -387,7 +394,7 @@ define i16 @test_i16_2032_mask_ashr_3(i16 %a0) {
; CHECK-LABEL: test_i16_2032_mask_ashr_3:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x7f0
-; CHECK-NEXT: lsr w0, w8, #3
+; CHECK-NEXT: asr w0, w8, #3
; CHECK-NEXT: ret
%t0 = and i16 %a0, 2032
%t1 = ashr i16 %t0, 3
@@ -396,7 +403,8 @@ define i16 @test_i16_2032_mask_ashr_3(i16 %a0) {
define i16 @test_i16_2032_mask_ashr_4(i16 %a0) {
; CHECK-LABEL: test_i16_2032_mask_ashr_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #4, #7
+; CHECK-NEXT: and w8, w0, #0x7f0
+; CHECK-NEXT: asr w0, w8, #4
; CHECK-NEXT: ret
%t0 = and i16 %a0, 2032
%t1 = ashr i16 %t0, 4
@@ -405,7 +413,8 @@ define i16 @test_i16_2032_mask_ashr_4(i16 %a0) {
define i16 @test_i16_2032_mask_ashr_5(i16 %a0) {
; CHECK-LABEL: test_i16_2032_mask_ashr_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #5, #6
+; CHECK-NEXT: and w8, w0, #0x7e0
+; CHECK-NEXT: asr w0, w8, #5
; CHECK-NEXT: ret
%t0 = and i16 %a0, 2032
%t1 = ashr i16 %t0, 5
@@ -414,7 +423,8 @@ define i16 @test_i16_2032_mask_ashr_5(i16 %a0) {
define i16 @test_i16_2032_mask_ashr_6(i16 %a0) {
; CHECK-LABEL: test_i16_2032_mask_ashr_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #6, #5
+; CHECK-NEXT: and w8, w0, #0x7c0
+; CHECK-NEXT: asr w0, w8, #6
; CHECK-NEXT: ret
%t0 = and i16 %a0, 2032
%t1 = ashr i16 %t0, 6
@@ -444,7 +454,8 @@ define i16 @test_i16_65024_mask_ashr_8(i16 %a0) {
define i16 @test_i16_65024_mask_ashr_9(i16 %a0) {
; CHECK-LABEL: test_i16_65024_mask_ashr_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: sbfx w0, w0, #9, #7
+; CHECK-NEXT: and w8, w0, #0xfe00
+; CHECK-NEXT: sbfx w0, w8, #9, #7
; CHECK-NEXT: ret
%t0 = and i16 %a0, 65024
%t1 = ashr i16 %t0, 9
@@ -485,7 +496,8 @@ define i16 @test_i16_127_mask_shl_8(i16 %a0) {
define i16 @test_i16_127_mask_shl_9(i16 %a0) {
; CHECK-LABEL: test_i16_127_mask_shl_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w0, w0, #9
+; CHECK-NEXT: and w8, w0, #0x7f
+; CHECK-NEXT: lsl w0, w8, #9
; CHECK-NEXT: ret
%t0 = and i16 %a0, 127
%t1 = shl i16 %t0, 9
@@ -651,7 +663,8 @@ define i32 @test_i32_4294836224_mask_lshr_18(i32 %a0) {
define i32 @test_i32_32767_mask_ashr_1(i32 %a0) {
; CHECK-LABEL: test_i32_32767_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #1, #14
+; CHECK-NEXT: and w8, w0, #0x7ffe
+; CHECK-NEXT: asr w0, w8, #1
; CHECK-NEXT: ret
%t0 = and i32 %a0, 32767
%t1 = ashr i32 %t0, 1
@@ -662,7 +675,7 @@ define i32 @test_i32_8388352_mask_ashr_7(i32 %a0) {
; CHECK-LABEL: test_i32_8388352_mask_ashr_7:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x7fff00
-; CHECK-NEXT: lsr w0, w8, #7
+; CHECK-NEXT: asr w0, w8, #7
; CHECK-NEXT: ret
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 7
@@ -671,7 +684,8 @@ define i32 @test_i32_8388352_mask_ashr_7(i32 %a0) {
define i32 @test_i32_8388352_mask_ashr_8(i32 %a0) {
; CHECK-LABEL: test_i32_8388352_mask_ashr_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #8, #15
+; CHECK-NEXT: and w8, w0, #0x7fff00
+; CHECK-NEXT: asr w0, w8, #8
; CHECK-NEXT: ret
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 8
@@ -680,7 +694,8 @@ define i32 @test_i32_8388352_mask_ashr_8(i32 %a0) {
define i32 @test_i32_8388352_mask_ashr_9(i32 %a0) {
; CHECK-LABEL: test_i32_8388352_mask_ashr_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #9, #14
+; CHECK-NEXT: and w8, w0, #0x7ffe00
+; CHECK-NEXT: asr w0, w8, #9
; CHECK-NEXT: ret
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 9
@@ -689,7 +704,8 @@ define i32 @test_i32_8388352_mask_ashr_9(i32 %a0) {
define i32 @test_i32_8388352_mask_ashr_10(i32 %a0) {
; CHECK-LABEL: test_i32_8388352_mask_ashr_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w0, w0, #10, #13
+; CHECK-NEXT: and w8, w0, #0x7ffc00
+; CHECK-NEXT: asr w0, w8, #10
; CHECK-NEXT: ret
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 10
@@ -719,7 +735,8 @@ define i32 @test_i32_4294836224_mask_ashr_16(i32 %a0) {
define i32 @test_i32_4294836224_mask_ashr_17(i32 %a0) {
; CHECK-LABEL: test_i32_4294836224_mask_ashr_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: asr w0, w0, #17
+; CHECK-NEXT: and w8, w0, #0xfffe0000
+; CHECK-NEXT: asr w0, w8, #17
; CHECK-NEXT: ret
%t0 = and i32 %a0, 4294836224
%t1 = ashr i32 %t0, 17
@@ -760,7 +777,8 @@ define i32 @test_i32_32767_mask_shl_16(i32 %a0) {
define i32 @test_i32_32767_mask_shl_17(i32 %a0) {
; CHECK-LABEL: test_i32_32767_mask_shl_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w0, w0, #17
+; CHECK-NEXT: and w8, w0, #0x7fff
+; CHECK-NEXT: lsl w0, w8, #17
; CHECK-NEXT: ret
%t0 = and i32 %a0, 32767
%t1 = shl i32 %t0, 17
@@ -926,7 +944,8 @@ define i64 @test_i64_18446744065119617024_mask_lshr_34(i64 %a0) {
define i64 @test_i64_2147483647_mask_ashr_1(i64 %a0) {
; CHECK-LABEL: test_i64_2147483647_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx x0, x0, #1, #30
+; CHECK-NEXT: and x8, x0, #0x7ffffffe
+; CHECK-NEXT: asr x0, x8, #1
; CHECK-NEXT: ret
%t0 = and i64 %a0, 2147483647
%t1 = ashr i64 %t0, 1
@@ -937,7 +956,7 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) {
; CHECK-LABEL: test_i64_140737488289792_mask_ashr_15:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0x7fffffff0000
-; CHECK-NEXT: lsr x0, x8, #15
+; CHECK-NEXT: asr x0, x8, #15
; CHECK-NEXT: ret
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 15
@@ -946,7 +965,8 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) {
define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) {
; CHECK-LABEL: test_i64_140737488289792_mask_ashr_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx x0, x0, #16, #31
+; CHECK-NEXT: and x8, x0, #0x7fffffff0000
+; CHECK-NEXT: asr x0, x8, #16
; CHECK-NEXT: ret
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 16
@@ -955,7 +975,8 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) {
define i64 @test_i64_140737488289792_mask_ashr_17(i64 %a0) {
; CHECK-LABEL: test_i64_140737488289792_mask_ashr_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx x0, x0, #17, #30
+; CHECK-NEXT: and x8, x0, #0x7ffffffe0000
+; CHECK-NEXT: asr x0, x8, #17
; CHECK-NEXT: ret
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 17
@@ -964,7 +985,8 @@ define i64 @test_i64_140737488289792_mask_ashr_17(i64 %a0) {
define i64 @test_i64_140737488289792_mask_ashr_18(i64 %a0) {
; CHECK-LABEL: test_i64_140737488289792_mask_ashr_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx x0, x0, #18, #29
+; CHECK-NEXT: and x8, x0, #0x7ffffffc0000
+; CHECK-NEXT: asr x0, x8, #18
; CHECK-NEXT: ret
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 18
@@ -994,7 +1016,8 @@ define i64 @test_i64_18446744065119617024_mask_ashr_32(i64 %a0) {
define i64 @test_i64_18446744065119617024_mask_ashr_33(i64 %a0) {
; CHECK-LABEL: test_i64_18446744065119617024_mask_ashr_33:
; CHECK: // %bb.0:
-; CHECK-NEXT: asr x0, x0, #33
+; CHECK-NEXT: and x8, x0, #0xfffffffe00000000
+; CHECK-NEXT: asr x0, x8, #33
; CHECK-NEXT: ret
%t0 = and i64 %a0, 18446744065119617024
%t1 = ashr i64 %t0, 33
@@ -1025,7 +1048,7 @@ define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) {
define i64 @test_i64_2147483647_mask_shl_32(i64 %a0) {
; CHECK-LABEL: test_i64_2147483647_mask_shl_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0x7fffffff
+; CHECK-NEXT: and x8, x0, #0x7fffffff
; CHECK-NEXT: lsl x0, x8, #32
; CHECK-NEXT: ret
%t0 = and i64 %a0, 2147483647
@@ -1035,7 +1058,8 @@ define i64 @test_i64_2147483647_mask_shl_32(i64 %a0) {
define i64 @test_i64_2147483647_mask_shl_33(i64 %a0) {
; CHECK-LABEL: test_i64_2147483647_mask_shl_33:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl x0, x0, #33
+; CHECK-NEXT: and x8, x0, #0x7fffffff
+; CHECK-NEXT: lsl x0, x8, #33
; CHECK-NEXT: ret
%t0 = and i64 %a0, 2147483647
%t1 = shl i64 %t0, 33
diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
index 307974c012a9e4..c07ce1b72bf798 100644
--- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
+++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
@@ -54,8 +54,9 @@ define <4 x i16> @and_extract_sext_idx4(<8 x i8> %vec) nounwind {
define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind {
; CHECK-LABEL: sext_extract_zext_idx0:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-NEXT: sshr v0.2s, v0.2s, #16
; CHECK-NEXT: ret
%zext = zext <4 x i16> %vec to <4 x i32>
%extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0)
@@ -95,9 +96,10 @@ define <4 x i16> @sext_extract_sext_idx0(<8 x i8> %vec) nounwind {
define <2 x i32> @sext_extract_zext_idx2(<4 x i16> %vec) nounwind {
; CHECK-LABEL: sext_extract_zext_idx2:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-NEXT: sshr v0.2s, v0.2s, #16
; CHECK-NEXT: ret
%zext = zext <4 x i16> %vec to <4 x i32>
%extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 2)
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index a78addc490086d..75bb722aff5e28 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -525,8 +525,9 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-LABEL: fdiv_pow_shl_cnt:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-1115684864 // =0xbd800000
; CHECK-NEXT: and w9, w0, #0x1f
+; CHECK-NEXT: mov w8, #-1090519040 // =0xbf000000
+; CHECK-NEXT: add w9, w9, #3
; CHECK-NEXT: sub w8, w8, w9, lsl #23
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 60ceaf19731921..d5fcde659b1e34 100644
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -274,7 +274,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
; CHECK-NEXT: and w8, w0, #0xff
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: lsr w0, w8, #7
+; CHECK-NEXT: ubfx w0, w8, #7, #1
; CHECK-NEXT: ret
%t0 = shl i8 128, %y
%t1 = and i8 %t0, %x
diff --git a/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll b/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
index a892bb85692d3e..14b7a1f915b8e6 100644
--- a/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
@@ -31,8 +31,8 @@ define i32 @and_nosignbit_shl(i32 %x, ptr %dst) {
define i32 @or_signbit_shl(i32 %x, ptr %dst) {
; CHECK-LABEL: or_signbit_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w8, w0, #8
-; CHECK-NEXT: orr w0, w8, #0xff000000
+; CHECK-NEXT: orr w8, w0, #0xffff0000
+; CHECK-NEXT: lsl w0, w8, #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
%t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -108,8 +108,8 @@ define i32 @add_nosignbit_shl(i32 %x, ptr %dst) {
define i32 @and_signbit_lshr(i32 %x, ptr %dst) {
; CHECK-LABEL: and_signbit_lshr:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #8
-; CHECK-NEXT: and w0, w8, #0xffff00
+; CHECK-NEXT: and w8, w0, #0xffff0000
+; CHECK-NEXT: lsr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
%t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -120,8 +120,8 @@ define i32 @and_signbit_lshr(i32 %x, ptr %dst) {
define i32 @and_nosignbit_lshr(i32 %x, ptr %dst) {
; CHECK-LABEL: and_nosignbit_lshr:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #8
-; CHECK-NEXT: and w0, w8, #0x7fff00
+; CHECK-NEXT: and w8, w0, #0x7fff0000
+; CHECK-NEXT: lsr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
%t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -211,8 +211,8 @@ define i32 @add_nosignbit_lshr(i32 %x, ptr %dst) {
define i32 @and_signbit_ashr(i32 %x, ptr %dst) {
; CHECK-LABEL: and_signbit_ashr:
; CHECK: // %bb.0:
-; CHECK-NEXT: asr w8, w0, #8
-; CHECK-NEXT: and w0, w8, #0xffffff00
+; CHECK-NEXT: and w8, w0, #0xffff0000
+; CHECK-NEXT: asr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
%t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -223,8 +223,8 @@ define i32 @and_signbit_ashr(i32 %x, ptr %dst) {
define i32 @and_nosignbit_ashr(i32 %x, ptr %dst) {
; CHECK-LABEL: and_nosignbit_ashr:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #8
-; CHECK-NEXT: and w0, w8, #0x7fff00
+; CHECK-NEXT: and w8, w0, #0x7fff0000
+; CHECK-NEXT: asr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
%t0 = and i32 %x, 2147418112 ; 0x7FFF0000
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 8a0e7661883f21..aa4226ac124cd2 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -191,12 +191,14 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-LABEL: v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: add x9, x1, #1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: ldrsb w9, [x1]
+; CHECK-NEXT: ldrsb w10, [x0, #1]
+; CHECK-NEXT: ldrsb w11, [x1, #1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: mov v0.s[1], w10
+; CHECK-NEXT: mov v1.s[1], w11
; CHECK-NEXT: shl v1.2s, v1.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
@@ -231,12 +233,14 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldrsh w8, [x0]
+; CHECK-SD-NEXT: ldrsh w9, [x1]
+; CHECK-SD-NEXT: ldrsh w10, [x0, #2]
+; CHECK-SD-NEXT: ldrsh w11, [x1, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: mov v0.s[1], w10
+; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/select_const.ll b/llvm/test/CodeGen/AArch64/select_const.ll
index cd50d776e913f1..0ff3e42e77da92 100644
--- a/llvm/test/CodeGen/AArch64/select_const.ll
+++ b/llvm/test/CodeGen/AArch64/select_const.ll
@@ -436,10 +436,11 @@ define i8 @sel_constants_shl_constant(i1 %cond) {
define i8 @shl_constant_sel_constants(i1 %cond) {
; CHECK-LABEL: shl_constant_sel_constants:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8 // =0x8
+; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: tst w0, #0x1
-; CHECK-NEXT: mov w9, #4 // =0x4
-; CHECK-NEXT: csel w0, w9, w8, ne
+; CHECK-NEXT: mov w9, #1 // =0x1
+; CHECK-NEXT: cinc x8, x8, eq
+; CHECK-NEXT: lsl w0, w9, w8
; CHECK-NEXT: ret
%sel = select i1 %cond, i8 2, i8 3
%bo = shl i8 1, %sel
@@ -461,10 +462,11 @@ define i8 @sel_constants_lshr_constant(i1 %cond) {
define i8 @lshr_constant_sel_constants(i1 %cond) {
; CHECK-LABEL: lshr_constant_sel_constants:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8 // =0x8
+; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: tst w0, #0x1
-; CHECK-NEXT: mov w9, #16 // =0x10
-; CHECK-NEXT: csel w0, w9, w8, ne
+; CHECK-NEXT: mov w9, #64 // =0x40
+; CHECK-NEXT: cinc x8, x8, eq
+; CHECK-NEXT: lsr w0, w9, w8
; CHECK-NEXT: ret
%sel = select i1 %cond, i8 2, i8 3
%bo = lshr i8 64, %sel
@@ -485,10 +487,11 @@ define i8 @sel_constants_ashr_constant(i1 %cond) {
define i8 @ashr_constant_sel_constants(i1 %cond) {
; CHECK-LABEL: ashr_constant_sel_constants:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-16 // =0xfffffff0
+; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: tst w0, #0x1
-; CHECK-NEXT: mov w9, #-32 // =0xffffffe0
-; CHECK-NEXT: csel w0, w9, w8, ne
+; CHECK-NEXT: mov w9, #-128 // =0xffffff80
+; CHECK-NEXT: cinc x8, x8, eq
+; CHECK-NEXT: asr w0, w9, w8
; CHECK-NEXT: ret
%sel = select i1 %cond, i8 2, i8 3
%bo = ashr i8 128, %sel
diff --git a/llvm/test/CodeGen/AArch64/shift-logic.ll b/llvm/test/CodeGen/AArch64/shift-logic.ll
index b1ad31d1475ced..5d3f8bde106bbe 100644
--- a/llvm/test/CodeGen/AArch64/shift-logic.ll
+++ b/llvm/test/CodeGen/AArch64/shift-logic.ll
@@ -233,7 +233,8 @@ define i32 @lshr_or_extra_use(i32 %x, i32 %y, ptr %p) nounwind {
define i64 @desirable_to_commute1(i64 %x) {
; CHECK-LABEL: desirable_to_commute1:
; CHECK: // %bb.0:
-; CHECK-NEXT: and x0, x0, #0x7fff8
+; CHECK-NEXT: ubfx x8, x0, #3, #16
+; CHECK-NEXT: lsl x0, x8, #3
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: desirable_to_commute1:
@@ -250,8 +251,8 @@ define i64 @desirable_to_commute1(i64 %x) {
define i64 @desirable_to_commute2(ptr %p, i64 %i) {
; CHECK-LABEL: desirable_to_commute2:
; CHECK: // %bb.0:
-; CHECK-NEXT: and x8, x1, #0x1ff8
-; CHECK-NEXT: ldr x0, [x0, x8]
+; CHECK-NEXT: ubfx x8, x1, #3, #10
+; CHECK-NEXT: ldr x0, [x0, x8, lsl #3]
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: desirable_to_commute2:
@@ -272,7 +273,8 @@ define i64 @desirable_to_commute2(ptr %p, i64 %i) {
define void @apint_type_mismatch(i16 %a, ptr %p) {
; CHECK-LABEL: apint_type_mismatch:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w8, w0, #0x7f8
+; CHECK-NEXT: ubfx w8, w0, #3, #8
+; CHECK-NEXT: lsl w8, w8, #3
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/shift-mod.ll b/llvm/test/CodeGen/AArch64/shift-mod.ll
index ac95b75168ed98..f36efb52788507 100644
--- a/llvm/test/CodeGen/AArch64/shift-mod.ll
+++ b/llvm/test/CodeGen/AArch64/shift-mod.ll
@@ -78,8 +78,9 @@ entry:
define i64 @ashr_add_shl_i32(i64 %r) {
; CHECK-LABEL: ashr_add_shl_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, #1
-; CHECK-NEXT: sxtw x0, w8
+; CHECK-NEXT: mov x8, #4294967296 // =0x100000000
+; CHECK-NEXT: add x8, x8, x0, lsl #32
+; CHECK-NEXT: asr x0, x8, #32
; CHECK-NEXT: ret
%conv = shl i64 %r, 32
%sext = add i64 %conv, 4294967296
@@ -90,8 +91,9 @@ define i64 @ashr_add_shl_i32(i64 %r) {
define i64 @ashr_add_shl_i8(i64 %r) {
; CHECK-LABEL: ashr_add_shl_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, #1
-; CHECK-NEXT: sxtb x0, w8
+; CHECK-NEXT: mov x8, #72057594037927936 // =0x100000000000000
+; CHECK-NEXT: add x8, x8, x0, lsl #56
+; CHECK-NEXT: asr x0, x8, #56
; CHECK-NEXT: ret
%conv = shl i64 %r, 56
%sext = add i64 %conv, 72057594037927936
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index bb4df6d8935b1b..a852943582a07a 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -316,7 +316,7 @@ define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xffff
; CHECK-NEXT: add w8, w8, #128
-; CHECK-NEXT: lsr w0, w8, #16
+; CHECK-NEXT: ubfx w0, w8, #16, #1
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 128 ; 1U << (8-1)
%tmp1 = icmp ult i16 %tmp0, 128 ; 1U << (8-1)
diff --git a/llvm/test/CodeGen/AArch64/srem-pow2.ll b/llvm/test/CodeGen/AArch64/srem-pow2.ll
index 4c114d185997e1..2fff31acb9d897 100644
--- a/llvm/test/CodeGen/AArch64/srem-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/srem-pow2.ll
@@ -33,8 +33,8 @@ define i16 @fold_srem_2_i16(i16 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x8000
; CHECK-NEXT: add w8, w0, w8, lsr #15
-; CHECK-NEXT: and w8, w8, #0xfffffffe
-; CHECK-NEXT: sub w0, w0, w8
+; CHECK-NEXT: sbfx w8, w8, #1, #15
+; CHECK-NEXT: sub w0, w0, w8, lsl #1
; CHECK-NEXT: ret
%1 = srem i16 %x, 2
ret i16 %1
@@ -66,10 +66,11 @@ define i16 @fold_srem_pow2_i16(i16 %x) {
; CHECK-LABEL: fold_srem_pow2_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: ubfx w8, w8, #25, #6
-; CHECK-NEXT: add w8, w0, w8
-; CHECK-NEXT: and w8, w8, #0xffffffc0
-; CHECK-NEXT: sub w0, w0, w8
+; CHECK-NEXT: lsr w8, w8, #15
+; CHECK-NEXT: and w8, w8, #0xfc00
+; CHECK-NEXT: add w8, w0, w8, lsr #10
+; CHECK-NEXT: sbfx w8, w8, #6, #10
+; CHECK-NEXT: sub w0, w0, w8, lsl #6
; CHECK-NEXT: ret
%1 = srem i16 %x, 64
ret i16 %1
@@ -103,10 +104,11 @@ define i16 @fold_srem_smax_i16(i16 %x) {
; CHECK-LABEL: fold_srem_smax_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: ubfx w8, w8, #16, #15
-; CHECK-NEXT: add w8, w0, w8
-; CHECK-NEXT: and w8, w8, #0xffff8000
-; CHECK-NEXT: add w0, w0, w8
+; CHECK-NEXT: lsr w8, w8, #15
+; CHECK-NEXT: and w8, w8, #0xfffe
+; CHECK-NEXT: add w8, w0, w8, lsr #1
+; CHECK-NEXT: ubfx w8, w8, #15, #1
+; CHECK-NEXT: add w0, w0, w8, lsl #15
; CHECK-NEXT: ret
%1 = srem i16 %x, 32768
ret i16 %1
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 595991e86a91c7..dbce51aaf2f8f5 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -25,8 +25,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: sbfx w8, w0, #0, #4
; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: ubfx w9, w8, #7, #1
-; CHECK-NEXT: add w8, w9, w8, lsr #4
+; CHECK-NEXT: lsr w8, w8, #4
+; CHECK-NEXT: and w9, w8, #0x8
+; CHECK-NEXT: add w8, w8, w9, lsr #3
; CHECK-NEXT: mov w9, #6 // =0x6
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: and w8, w8, #0xf
@@ -42,10 +43,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK-LABEL: test_srem_pow2_setne:
; CHECK: // %bb.0:
; CHECK-NEXT: sbfx w8, w0, #0, #6
-; CHECK-NEXT: ubfx w8, w8, #9, #2
-; CHECK-NEXT: add w8, w0, w8
-; CHECK-NEXT: and w8, w8, #0x3c
-; CHECK-NEXT: sub w8, w0, w8
+; CHECK-NEXT: lsr w8, w8, #5
+; CHECK-NEXT: and w8, w8, #0x30
+; CHECK-NEXT: add w8, w0, w8, lsr #4
+; CHECK-NEXT: lsr w8, w8, #2
+; CHECK-NEXT: sub w8, w0, w8, lsl #2
; CHECK-NEXT: tst w8, #0x3f
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
index c0c0ae5c9d1fe9..8e7dfb5f85ebab 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -185,9 +185,10 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: usra v2.4s, v1.4s, #28
+; CHECK-NEXT: sshr v1.4s, v2.4s, #4
+; CHECK-NEXT: shl v1.4s, v1.4s, #4
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bic v2.4s, #15
-; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -201,11 +202,11 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_int_min:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT: mov v3.16b, v0.16b
-; CHECK-NEXT: movi v1.4s, #128, lsl #24
-; CHECK-NEXT: usra v3.4s, v2.4s, #1
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: usra v2.4s, v1.4s, #1
+; CHECK-NEXT: cmlt v1.4s, v2.4s, #0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/sshl_sat.ll b/llvm/test/CodeGen/AArch64/sshl_sat.ll
index fbcd2db1298f0b..af35954e21ba2b 100644
--- a/llvm/test/CodeGen/AArch64/sshl_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sshl_sat.ll
@@ -131,7 +131,8 @@ define void @combine_shlsat_vector() nounwind {
define i16 @combine_shlsat_to_shl(i16 %x) nounwind {
; CHECK-LABEL: combine_shlsat_to_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w0, w0, #0xfffffffc
+; CHECK-NEXT: sbfx w8, w0, #2, #14
+; CHECK-NEXT: lsl w0, w8, #2
; CHECK-NEXT: ret
%x2 = ashr i16 %x, 2
%tmp = call i16 @llvm.sshl.sat.i16(i16 %x2, i16 2)
@@ -142,14 +143,14 @@ define i16 @combine_shlsat_to_shl(i16 %x) nounwind {
define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
; CHECK-LABEL: combine_shlsat_to_shl_no_fold:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: mov w9, #-65536 // =0xffff0000
-; CHECK-NEXT: mov w10, #-2147483648 // =0x80000000
-; CHECK-NEXT: ands w8, w9, w8, lsl #14
-; CHECK-NEXT: cinv w10, w10, ge
-; CHECK-NEXT: lsl w9, w8, #3
-; CHECK-NEXT: cmp w8, w9, asr #3
-; CHECK-NEXT: csel w8, w10, w9, ne
+; CHECK-NEXT: sbfx w9, w0, #2, #14
+; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT: lsl w10, w9, #16
+; CHECK-NEXT: lsl w9, w9, #19
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: cinv w8, w8, ge
+; CHECK-NEXT: cmp w10, w9, asr #3
+; CHECK-NEXT: csel w8, w8, w9, ne
; CHECK-NEXT: asr w0, w8, #16
; CHECK-NEXT: ret
%x2 = ashr i16 %x, 2
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index a8c1276eadc4fa..1aa2f0edd7b8e6 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -192,12 +192,14 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-LABEL: v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: add x9, x1, #1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: ldrsb w9, [x1]
+; CHECK-NEXT: ldrsb w10, [x0, #1]
+; CHECK-NEXT: ldrsb w11, [x1, #1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: mov v0.s[1], w10
+; CHECK-NEXT: mov v1.s[1], w11
; CHECK-NEXT: shl v1.2s, v1.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
@@ -232,12 +234,14 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldrsh w8, [x0]
+; CHECK-SD-NEXT: ldrsh w9, [x1]
+; CHECK-SD-NEXT: ldrsh w10, [x0, #2]
+; CHECK-SD-NEXT: ldrsh w11, [x1, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: mov v0.s[1], w10
+; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress.ll b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
index 0571bbc278a6f4..b977e62774e58d 100644
--- a/llvm/test/CodeGen/AArch64/storepairsuppress.ll
+++ b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
@@ -168,10 +168,10 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
; SUPPRESS-NEXT: fsub s3, s5, s6
; SUPPRESS-NEXT: fsub s2, s16, s2
; SUPPRESS-NEXT: stp s3, s2, [x8, #40]
-; SUPPRESS-NEXT: lsl x9, x3, #33
+; SUPPRESS-NEXT: sbfiz x9, x3, #1, #31
; SUPPRESS-NEXT: ldr x10, [x0, #8]
-; SUPPRESS-NEXT: add x9, x10, x9, asr #29
-; SUPPRESS-NEXT: ldp s2, s3, [x9]
+; SUPPRESS-NEXT: add x10, x10, x9, lsl #3
+; SUPPRESS-NEXT: ldp s2, s3, [x10]
; SUPPRESS-NEXT: ldp s5, s6, [x8, #16]
; SUPPRESS-NEXT: fmul s16, s4, s3
; SUPPRESS-NEXT: fmul s3, s7, s3
@@ -183,7 +183,7 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
; SUPPRESS-NEXT: fsub s3, s5, s7
; SUPPRESS-NEXT: fsub s2, s6, s2
; SUPPRESS-NEXT: stp s3, s2, [x8, #48]
-; SUPPRESS-NEXT: add w9, w3, w3, lsl #1
+; SUPPRESS-NEXT: add w9, w9, w3
; SUPPRESS-NEXT: ldr x10, [x0, #8]
; SUPPRESS-NEXT: add x9, x10, w9, sxtw #3
; SUPPRESS-NEXT: ldp s2, s3, [x9]
@@ -337,10 +337,10 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
; NOSUPPRESS-NEXT: fsub s3, s5, s6
; NOSUPPRESS-NEXT: fsub s2, s16, s2
; NOSUPPRESS-NEXT: stp s3, s2, [x8, #40]
-; NOSUPPRESS-NEXT: lsl x9, x3, #33
+; NOSUPPRESS-NEXT: sbfiz x9, x3, #1, #31
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
-; NOSUPPRESS-NEXT: add x9, x10, x9, asr #29
-; NOSUPPRESS-NEXT: ldp s2, s3, [x9]
+; NOSUPPRESS-NEXT: add x10, x10, x9, lsl #3
+; NOSUPPRESS-NEXT: ldp s2, s3, [x10]
; NOSUPPRESS-NEXT: ldp s5, s6, [x8, #16]
; NOSUPPRESS-NEXT: fmul s16, s4, s3
; NOSUPPRESS-NEXT: fmul s3, s7, s3
@@ -352,7 +352,7 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
; NOSUPPRESS-NEXT: fsub s3, s5, s7
; NOSUPPRESS-NEXT: fsub s2, s6, s2
; NOSUPPRESS-NEXT: stp s3, s2, [x8, #48]
-; NOSUPPRESS-NEXT: add w9, w3, w3, lsl #1
+; NOSUPPRESS-NEXT: add w9, w9, w3
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
; NOSUPPRESS-NEXT: add x9, x10, w9, sxtw #3
; NOSUPPRESS-NEXT: ldp s2, s3, [x9]
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 0ad99008655184..f85aca97cb66a9 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -193,9 +193,10 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT: and x10, x11, #0x1fffffffc
; CHECK-NEXT: dup v0.4s, w8
-; CHECK-NEXT: add x8, x1, x10, lsl #3
-; CHECK-NEXT: add x9, x0, x10, lsl #1
+; CHECK-NEXT: lsl x9, x10, #1
; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: add x8, x1, x9, lsl #2
+; CHECK-NEXT: add x9, x0, x9
; CHECK-NEXT: .LBB1_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32
@@ -593,9 +594,10 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: and x10, x11, #0x1fffffffc
; CHECK-NEXT: dup v0.4s, w8
; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0]
-; CHECK-NEXT: add x8, x1, x10, lsl #4
-; CHECK-NEXT: add x9, x0, x10, lsl #2
+; CHECK-NEXT: lsl x9, x10, #2
; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: add x8, x1, x9, lsl #2
+; CHECK-NEXT: add x9, x0, x9
; CHECK-NEXT: .LBB3_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
diff --git a/llvm/test/CodeGen/AArch64/ushl_sat.ll b/llvm/test/CodeGen/AArch64/ushl_sat.ll
index 870f80545f9993..609671a44bee8d 100644
--- a/llvm/test/CodeGen/AArch64/ushl_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ushl_sat.ll
@@ -117,7 +117,8 @@ define void @combine_shlsat_vector() nounwind {
define i16 @combine_shlsat_to_shl(i16 %x) nounwind {
; CHECK-LABEL: combine_shlsat_to_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w0, w0, #0xfffffffc
+; CHECK-NEXT: ubfx w8, w0, #2, #14
+; CHECK-NEXT: lsl w0, w8, #2
; CHECK-NEXT: ret
%x2 = lshr i16 %x, 2
%tmp = call i16 @llvm.ushl.sat.i16(i16 %x2, i16 2)
@@ -128,9 +129,9 @@ define i16 @combine_shlsat_to_shl(i16 %x) nounwind {
define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
; CHECK-LABEL: combine_shlsat_to_shl_no_fold:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w8, w0, #14
-; CHECK-NEXT: and w8, w8, #0x3fff0000
-; CHECK-NEXT: lsl w9, w8, #3
+; CHECK-NEXT: ubfx w8, w0, #2, #14
+; CHECK-NEXT: lsl w9, w8, #19
+; CHECK-NEXT: lsl w8, w8, #16
; CHECK-NEXT: cmp w8, w9, lsr #3
; CHECK-NEXT: csinv w8, w9, wzr, eq
; CHECK-NEXT: lsr w0, w8, #16
@@ -144,6 +145,8 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
define <2 x i16> @combine_shlsat_to_shl_vec(<2 x i8> %a) nounwind {
; CHECK-LABEL: combine_shlsat_to_shl_vec:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0x0000ff000000ff
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: shl v0.2s, v0.2s, #8
; CHECK-NEXT: ret
%ext = zext <2 x i8> %a to <2 x i16>
diff --git a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
index 6525d6cd7458b5..0313c5eff464d8 100644
--- a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
@@ -126,7 +126,7 @@ define <16 x i8> @test_128_i8_x_16_28_mask_ashr_1(<16 x i8> %a0) {
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #28
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #1
+; CHECK-NEXT: sshr v0.16b, v0.16b, #1
; CHECK-NEXT: ret
%t0 = and <16 x i8> %a0, <i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28>
%t1 = ashr <16 x i8> %t0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -137,7 +137,7 @@ define <16 x i8> @test_128_i8_x_16_28_mask_ashr_2(<16 x i8> %a0) {
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #28
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #2
+; CHECK-NEXT: sshr v0.16b, v0.16b, #2
; CHECK-NEXT: ret
%t0 = and <16 x i8> %a0, <i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28>
%t1 = ashr <16 x i8> %t0, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -234,6 +234,8 @@ define <16 x i8> @test_128_i8_x_16_7_mask_shl_4(<16 x i8> %a0) {
define <16 x i8> @test_128_i8_x_16_7_mask_shl_5(<16 x i8> %a0) {
; CHECK-LABEL: test_128_i8_x_16_7_mask_shl_5:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.16b, #7
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.16b, v0.16b, #5
; CHECK-NEXT: ret
%t0 = and <16 x i8> %a0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -435,7 +437,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_3(<8 x i16> %a0) {
; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.8h, v0.8h, #3
+; CHECK-NEXT: sshr v0.8h, v0.8h, #3
; CHECK-NEXT: ret
%t0 = and <8 x i16> %a0, <i16 2032, i16 2032, i16 2032, i16 2032, i16 2032, i16 2032, i16 2032, i16 2032>
%t1 = ashr <8 x i16> %t0, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -447,7 +449,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_4(<8 x i16> %a0) {
; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.8h, v0.8h, #4
+; CHECK-NEXT: sshr v0.8h, v0.8h, #4
; CHECK-NEXT: ret
%t0 = and <8 x i16> %a0, <i16 2032, i16 2032, i16 2032, i16 2032, i16 2032, i16 2032, i16 2032, i16 2032>
%t1 = ashr <8 x i16> %t0, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
@@ -546,6 +548,8 @@ define <8 x i16> @test_128_i16_x_8_127_mask_shl_8(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_127_mask_shl_9(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_127_mask_shl_9:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.8h, #127
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #9
; CHECK-NEXT: ret
%t0 = and <8 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
@@ -751,7 +755,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_7(<4 x i32> %a0) {
; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.4s, v0.4s, #7
+; CHECK-NEXT: sshr v0.4s, v0.4s, #7
; CHECK-NEXT: ret
%t0 = and <4 x i32> %a0, <i32 8388352, i32 8388352, i32 8388352, i32 8388352>
%t1 = ashr <4 x i32> %t0, <i32 7, i32 7, i32 7, i32 7>
@@ -763,7 +767,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_8(<4 x i32> %a0) {
; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.4s, v0.4s, #8
+; CHECK-NEXT: sshr v0.4s, v0.4s, #8
; CHECK-NEXT: ret
%t0 = and <4 x i32> %a0, <i32 8388352, i32 8388352, i32 8388352, i32 8388352>
%t1 = ashr <4 x i32> %t0, <i32 8, i32 8, i32 8, i32 8>
@@ -862,6 +866,8 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_shl_16(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_32767_mask_shl_17(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_32767_mask_shl_17:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #17
; CHECK-NEXT: ret
%t0 = and <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767>
@@ -1071,7 +1077,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.2d, v0.2d, #15
+; CHECK-NEXT: sshr v0.2d, v0.2d, #15
; CHECK-NEXT: ret
%t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
%t1 = ashr <2 x i64> %t0, <i64 15, i64 15>
@@ -1083,7 +1089,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_16(<2 x i64> %a0) {
; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.2d, v0.2d, #16
+; CHECK-NEXT: sshr v0.2d, v0.2d, #16
; CHECK-NEXT: ret
%t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
%t1 = ashr <2 x i64> %t0, <i64 16, i64 16>
@@ -1186,6 +1192,9 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_32(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_33(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_shl_33:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #33
; CHECK-NEXT: ret
%t0 = and <2 x i64> %a0, <i64 2147483647, i64 2147483647>
diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll
index aaf4cad6087403..0d5f866b950cd4 100644
--- a/llvm/test/CodeGen/AArch64/win64_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll
@@ -185,7 +185,7 @@ define void @vla(i32, ptr, ...) local_unnamed_addr {
; CHECK-NEXT: str x8, [x29, #16]
; CHECK-NEXT: add x8, x9, #15
; CHECK-NEXT: mov x23, sp
-; CHECK-NEXT: lsr x15, x8, #4
+; CHECK-NEXT: ubfx x15, x8, #4, #29
; CHECK-NEXT: stp x2, x3, [x29, #24]
; CHECK-NEXT: stp x4, x5, [x29, #40]
; CHECK-NEXT: stp x6, x7, [x29, #56]
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 6f67ce4de9ce54..90103f71de098d 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -487,7 +487,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_add_u16_e32 v2, 32, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -993,7 +994,8 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
; VI-LABEL: add_inline_imm_neg1_0:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_u16_e32 v0, -1, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1023,7 +1025,8 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
; VI-LABEL: add_inline_imm_1_0:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_u16_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 897e134ee48d83..21d4e439d0e089 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -156,6 +156,8 @@ define amdgpu_kernel void @anyext_v2i16_to_v2i32() #0 {
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GCN-NEXT: v_bfe_u32 v0, v0, 8, 1
; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 7108f3d65768cd..a530e26fa9e63b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -81,7 +81,8 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: ; return to shader part epilog
%trunc = fptrunc float %src to bfloat
%ext = fpext bfloat %trunc to float
@@ -100,35 +101,39 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GCN-NEXT: v_add_u32_e32 v4, v6, v4
; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT: s_brev_b32 s4, 1
-; GCN-NEXT: v_and_or_b32 v5, v1, s4, v4
-; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT: s_movk_i32 s5, 0x7fff
-; GCN-NEXT: v_add3_u32 v4, v4, v5, s5
+; GCN-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
+; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v1
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_lshrrev_b64 v[6:7], 32, v[4:5]
+; GCN-NEXT: v_or_b32_e32 v5, v8, v6
+; GCN-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GCN-NEXT: s_movk_i32 s4, 0x7fff
+; GCN-NEXT: v_add3_u32 v6, v6, v5, s4
; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
; GCN-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
-; GCN-NEXT: v_and_b32_e32 v6, 1, v5
+; GCN-NEXT: v_and_b32_e32 v7, 1, v5
; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
; GCN-NEXT: v_add_u32_e32 v0, v5, v0
; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN-NEXT: v_and_or_b32 v1, v3, s4, v0
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 1
-; GCN-NEXT: v_add3_u32 v0, v0, v1, s5
-; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT: v_cndmask_b32_e32 v7, v0, v5, vcc
+; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v3
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 32, v[4:5]
+; GCN-NEXT: v_or_b32_e32 v0, v7, v0
+; GCN-NEXT: v_bfe_u32 v1, v7, 16, 1
+; GCN-NEXT: v_add3_u32 v1, v1, v0, s4
+; GCN-NEXT: v_or_b32_e32 v0, 0x400000, v0
; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
; GCN-NEXT: s_mov_b32 s0, 0x7060302
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_perm_b32 v0, v0, v4, s0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT: v_perm_b32 v0, v0, v6, s0
; GCN-NEXT: ; return to shader part epilog
%res = fptrunc <2 x double> %src to <2 x bfloat>
%cast = bitcast <2 x bfloat> %res to float
@@ -271,16 +276,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GCN-NEXT: v_add_u32_e32 v4, v6, v4
; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT: s_brev_b32 s0, 1
-; GCN-NEXT: v_and_or_b32 v5, v1, s0, v4
-; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc
+; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v1
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_lshrrev_b64 v[4:5], 32, v[4:5]
+; GCN-NEXT: v_or_b32_e32 v4, v6, v4
+; GCN-NEXT: v_bfe_u32 v5, v6, 16, 1
; GCN-NEXT: s_movk_i32 s0, 0x7fff
-; GCN-NEXT: v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_add3_u32 v5, v5, v4, s0
+; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT: s_endpgm
entry:
@@ -299,19 +306,21 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
; GCN-NEXT: v_add_u32_e32 v4, v7, v4
; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT: s_brev_b32 s4, 1
-; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT: v_and_or_b32 v5, v6, s4, v4
-; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: v_cndmask_b32_e32 v7, v4, v7, vcc
+; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v6
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_lshrrev_b64 v[4:5], 32, v[4:5]
+; GCN-NEXT: v_or_b32_e32 v4, v7, v4
+; GCN-NEXT: v_bfe_u32 v5, v7, 16, 1
; GCN-NEXT: s_movk_i32 s0, 0x7fff
-; GCN-NEXT: v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_add3_u32 v5, v5, v4, s0
+; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GCN-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT: s_endpgm
entry:
@@ -331,19 +340,21 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
; GCN-NEXT: v_add_u32_e32 v4, v7, v4
; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT: s_brev_b32 s0, 1
-; GCN-NEXT: v_and_or_b32 v5, v6, s0, v4
-; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: v_cndmask_b32_e32 v7, v4, v7, vcc
+; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v6
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_lshrrev_b64 v[4:5], 32, v[4:5]
+; GCN-NEXT: v_or_b32_e32 v4, v7, v4
+; GCN-NEXT: v_bfe_u32 v5, v7, 16, 1
; GCN-NEXT: s_movk_i32 s0, 0x7fff
-; GCN-NEXT: v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_add3_u32 v5, v5, v4, s0
+; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GCN-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4c9c34de7194ce..38c590c17507a7 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -80,10 +80,11 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v2bf16:
@@ -93,10 +94,11 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v2bf16:
@@ -141,8 +143,9 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v3bf16:
@@ -155,8 +158,9 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v3bf16:
@@ -198,12 +202,14 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v4bf16:
@@ -213,12 +219,14 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v4bf16:
@@ -260,14 +268,17 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v6bf16:
@@ -277,14 +288,17 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: buffer_load_dwordx3 v[5:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v6bf16:
@@ -326,16 +340,20 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v8bf16:
@@ -345,16 +363,20 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v8bf16:
@@ -396,26 +418,34 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v16bf16:
@@ -425,26 +455,34 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v16bf16:
@@ -501,46 +539,62 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[23:26], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[31:34], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v18
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v32bf16:
@@ -550,46 +604,62 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_dwordx4 v[17:20], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[21:24], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[31:34], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v33
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v34
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v32bf16:
@@ -2273,19 +2343,21 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v6, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX8-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], 32, v[6:7]
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
+; GFX8-NEXT: v_and_b32_e32 v7, 1, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4
+; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v8, v4
; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_or_b32_e32 v5, v4, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT: v_or_b32_e32 v5, v4, v6
; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
@@ -2301,23 +2373,25 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_brev_b32 s8, 1
-; GFX9-NEXT: s_movk_i32 s9, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_movk_i32 s8, 0x7fff
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX9-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v1
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], 32, v[6:7]
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v8
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
-; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT: v_add_u32_e32 v4, v8, v4
; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v4, v6
; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
+; GFX9-NEXT: v_add3_u32 v4, v4, v5, s8
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
@@ -2335,16 +2409,19 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
-; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_and_b32_e32 v5, 0x80000000, v1
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], 32, v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e64 v8, -1, 1, s5
; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v7, v6, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
-; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT: v_bfe_u32 v5, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo
; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2354,26 +2431,29 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX11-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT: v_and_b32_e32 v5, 0x80000000, v1
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], 32, v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v8, -1, 1, s1
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, v6, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
-; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v5, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load double, ptr addrspace(1) %in
@@ -5704,10 +5784,11 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
@@ -5717,10 +5798,11 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
@@ -5729,7 +5811,8 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
@@ -5738,25 +5821,29 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GFX9-NEXT: global_load_dword v1, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <2 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <2 x bfloat> %load to <2 x float>
@@ -5771,11 +5858,12 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
@@ -5785,11 +5873,12 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
@@ -5798,8 +5887,9 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
@@ -5808,8 +5898,9 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
@@ -5817,9 +5908,10 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
@@ -5827,9 +5919,11 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <3 x bfloat> %load to <3 x float>
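Note, not part of the patch: the recurring change across these hunks is that the
high-half mask, previously selected as "v_and_b32_e32 vD, 0xffff0000, vS", is now
selected as a v_lshrrev_b32_e32/v_lshlrev_b32_e32 pair. The two forms compute the
same value; a minimal IR sketch of the equivalence (function names are illustrative
only, not taken from the tests):

; Two equivalent ways to keep only the high 16 bits of an i32 holding
; two bfloat16 halves.
define i32 @mask_form(i32 %x) {
  %r = and i32 %x, -65536        ; x & 0xffff0000
  ret i32 %r
}

define i32 @shift_pair_form(i32 %x) {
  %hi = lshr i32 %x, 16          ; high half moved into the low 16 bits
  %r = shl nuw i32 %hi, 16       ; shifted back up; nuw holds since %hi < 2^16
  ret i32 %r
}

With shift flags now set during the DAGCombiner visit, the shift-pair form is what
reaches selection for the odd lanes of these bf16-to-f32 extends, which appears to
be why the autogenerated checks trade one v_and for a shr/shl pair below.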
@@ -5844,12 +5938,14 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
@@ -5859,56 +5955,67 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[3:4], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <4 x bfloat> %load to <4 x float>
@@ -5923,15 +6030,17 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
@@ -5941,39 +6050,45 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[5:8], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
@@ -5981,11 +6096,13 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
@@ -5993,11 +6110,13 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <5 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <5 x bfloat> %load to <5 x float>
@@ -6012,14 +6131,17 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
@@ -6029,66 +6151,81 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx3 v[3:5], v[0:1]
+; GFX8-NEXT: flat_load_dwordx3 v[5:7], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
+; GFX9-NEXT: global_load_dwordx3 v[5:7], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
+; GFX10-NEXT: global_load_dwordx3 v[1:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b96 v[3:5], v[0:1], off
+; GFX11-NEXT: global_load_b96 v[1:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <6 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <6 x bfloat> %load to <6 x float>
@@ -6103,16 +6240,20 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
@@ -6122,76 +6263,96 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[5:8], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[5:8], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <8 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <8 x bfloat> %load to <8 x float>
@@ -6206,26 +6367,34 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
@@ -6235,130 +6404,170 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[15:18], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v18
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v18
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
+; GFX11-NEXT: global_load_b128 v[15:18], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[19:22], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v18
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <16 x bfloat> %load to <16 x float>
@@ -6373,46 +6582,62 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[26:29], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v10
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v36
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
@@ -6422,46 +6647,62 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[26:29], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v21
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v10
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v38, 16, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v39, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v35
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v34
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v33
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v39
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v38
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v37
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v36
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
@@ -6469,187 +6710,251 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3]
+; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[15:18], v[2:3]
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3]
-; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[23:26], v[2:3]
+; GFX8-NEXT: flat_load_dwordx4 v[31:34], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v10
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v18
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v26
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v31
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v31
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v32
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v33
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v33
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GFX9-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:48
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v18
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v26
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v33
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v34
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GFX10-NEXT: global_load_dwordx4 v[31:34], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:32
+; GFX10-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v31
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v32
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v34
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v36
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v38
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v51
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v36
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v38
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v52
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
-; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32
-; GFX11-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48
+; GFX11-NEXT: global_load_b128 v[31:34], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[35:38], v[0:1], off offset:16
+; GFX11-NEXT: global_load_b128 v[48:51], v[0:1], off offset:32
+; GFX11-NEXT: global_load_b128 v[52:55], v[0:1], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v33
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v34
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v36
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v37
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v38
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v48
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v49
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v50
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v51
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v28
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
-; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v30
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v33
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v35
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v36
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v37
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v38
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v49
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v50
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v51
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v52
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v53
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v54
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v55
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x float>
@@ -6666,9 +6971,10 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -6679,10 +6985,11 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -6690,22 +6997,24 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -6714,9 +7023,10 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -6725,10 +7035,12 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <2 x bfloat>, ptr addrspace(1) %ptr
@@ -6746,12 +7058,13 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
@@ -6761,40 +7074,43 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
@@ -6802,12 +7118,13 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
@@ -6815,14 +7132,15 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <3 x bfloat> %load to <3 x double>
@@ -6839,13 +7157,15 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -6858,13 +7178,15 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -6874,12 +7196,14 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -6889,44 +7213,51 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x bfloat>, ptr addrspace(1) %ptr
@@ -6942,42 +7273,46 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
-; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -6986,16 +7321,18 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
@@ -7003,50 +7340,57 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <5 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <5 x bfloat> %load to <5 x double>
@@ -7063,17 +7407,20 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -7086,17 +7433,20 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -7106,16 +7456,19 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -7125,54 +7478,63 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off
+; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <6 x bfloat>, ptr addrspace(1) %ptr
@@ -7190,21 +7552,25 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v10
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -7217,21 +7583,25 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -7240,21 +7610,25 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v6
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -7263,67 +7637,79 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v6
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v7
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v7
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <8 x bfloat>, ptr addrspace(1) %ptr
@@ -7339,42 +7725,50 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v5
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v10
+; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v22
+; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v23
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
-; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
-; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
-; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
-; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
-; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
-; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
-; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
-; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v19
+; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v18
+; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v26
+; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v27
+; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
@@ -7384,42 +7778,50 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v10
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v22
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v23
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v19
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v18
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v26
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v27
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
@@ -7430,38 +7832,46 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7
-; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
-; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8
-; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v9
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v17
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v14
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v18
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v19
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v22
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v23
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v26
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v27
; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -7472,38 +7882,46 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v9
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v23
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v15
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v18
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v19
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v10
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v26
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v27
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v30
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v31
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
@@ -7511,40 +7929,48 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v11
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v14
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v27
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v15
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v19
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v10
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -7552,41 +7978,49 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16
+; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v14
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v32
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v11
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v19
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
@@ -8948,10 +9382,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16:
@@ -8959,10 +9396,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16:
@@ -9037,31 +9477,45 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v1, v1, v3
; GCN-NEXT: v_add_f32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v2bf16:
@@ -9071,9 +9525,11 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -9094,9 +9550,11 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
@@ -9116,49 +9574,53 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-LABEL: v_fadd_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v1 :: v_dual_add_f32 v1, v3, v2
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <2 x bfloat> %a, %b
ret <2 x bfloat> %op
@@ -9170,45 +9632,65 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_add_f32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v1, v1, v4
+; GCN-NEXT: v_add_f32_e32 v2, v2, v5
; GCN-NEXT: v_add_f32_e32 v0, v0, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v3bf16:
@@ -9227,10 +9709,12 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -9261,9 +9745,11 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
@@ -9283,31 +9769,33 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-LABEL: v_fadd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX10-NEXT: v_add_f32_e32 v2, v5, v4
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = fadd <3 x bfloat> %a, %b
@@ -9320,57 +9808,83 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_add_f32_e32 v1, v1, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v7
; GCN-NEXT: v_add_f32_e32 v2, v2, v6
-; GCN-NEXT: v_add_f32_e32 v1, v1, v5
; GCN-NEXT: v_add_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v4bf16:
@@ -9380,9 +9894,11 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_add_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
@@ -9399,9 +9915,11 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -9424,9 +9942,11 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
@@ -9441,9 +9961,11 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -9465,81 +9987,88 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_add_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v2, v5, v6
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v5, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -9551,105 +10080,157 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_add_f32_e32 v5, v5, v13
+; GCN-NEXT: v_add_f32_e32 v1, v1, v9
; GCN-NEXT: v_add_f32_e32 v7, v7, v15
; GCN-NEXT: v_add_f32_e32 v6, v6, v14
-; GCN-NEXT: v_add_f32_e32 v5, v5, v13
; GCN-NEXT: v_add_f32_e32 v4, v4, v12
; GCN-NEXT: v_add_f32_e32 v3, v3, v11
; GCN-NEXT: v_add_f32_e32 v2, v2, v10
-; GCN-NEXT: v_add_f32_e32 v1, v1, v9
; GCN-NEXT: v_add_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v15
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v9
; GFX7-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v8bf16:
@@ -9659,9 +10240,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_add_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
@@ -9678,9 +10261,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_add_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
@@ -9696,9 +10281,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_add_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
@@ -9714,9 +10301,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_add_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
@@ -9743,9 +10332,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
@@ -9760,9 +10351,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
@@ -9776,9 +10369,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
@@ -9792,9 +10387,11 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
@@ -9818,155 +10415,171 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_add_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v7, v10, v9
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add_f32_e32 v9, v10, v9
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_add_f32_e32 v6, v10, v6
-; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v9, v12, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_add_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v4, v15, v14
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v9
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v14, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX10-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_add_f32_e32 v7, v10, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v9, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v6, v10, v6
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_add_f32 v2, v2, v6
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
-; GFX11-NEXT: v_add_f32_e32 v5, v15, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v9, v12, v11
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v4, v15, v14
; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v14, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX11-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %op
@@ -9976,207 +10589,311 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_fadd_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_add_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_add_f32_e32 v13, v13, v29
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_add_f32_e32 v12, v12, v28
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_add_f32_e32 v11, v11, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_add_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_add_f32_e32 v9, v9, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_add_f32_e32 v8, v8, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_add_f32_e32 v7, v7, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_add_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_add_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_add_f32_e32 v1, v1, v17
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_add_f32_e32 v14, v14, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_add_f32_e32 v12, v12, v17
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_add_f32_e32 v11, v11, v17
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_add_f32_e32 v10, v10, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_add_f32_e32 v8, v8, v17
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_add_f32_e32 v7, v7, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_add_f32_e32 v4, v4, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_add_f32_e32 v6, v6, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_add_f32_e32 v3, v3, v19
-; GCN-NEXT: v_add_f32_e32 v2, v2, v18
-; GCN-NEXT: v_add_f32_e32 v1, v1, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_add_f32_e32 v4, v4, v19
+; GCN-NEXT: v_add_f32_e32 v3, v3, v18
+; GCN-NEXT: v_add_f32_e32 v2, v2, v17
; GCN-NEXT: v_add_f32_e32 v0, v0, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_add_f32_e32 v15, v15, v16
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_add_f32_e32 v29, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_add_f32_e32 v25, v9, v13
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v21, v5, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v31, v1, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v19
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_add_f32_e32 v11, v11, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v28
+; GFX7-NEXT: v_add_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GFX7-NEXT: v_add_f32_e32 v14, v14, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v12, v12, v25
+; GFX7-NEXT: v_add_f32_e32 v10, v10, v23
+; GFX7-NEXT: v_add_f32_e32 v8, v8, v22
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v16
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v13
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v21, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v20, 0, 16
+; GFX7-NEXT: v_bfe_u32 v9, v18, 0, 16
+; GFX7-NEXT: v_bfe_u32 v13, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v16bf16:
@@ -10186,10 +10903,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX8-NEXT: v_add_f32_e32 v16, v17, v16
; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
@@ -10205,9 +10924,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_add_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
@@ -10223,9 +10944,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_add_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
@@ -10241,9 +10964,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_add_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
@@ -10259,9 +10984,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_add_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
@@ -10277,9 +11004,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_add_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
@@ -10295,9 +11024,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_add_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
@@ -10313,9 +11044,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_add_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
@@ -10350,9 +11083,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
@@ -10367,9 +11102,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
@@ -10383,9 +11120,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
@@ -10399,9 +11138,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
@@ -10415,9 +11156,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
@@ -10431,9 +11174,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
@@ -10447,9 +11192,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
@@ -10463,9 +11210,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
@@ -10493,139 +11242,155 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_add_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT: v_add_f32_e32 v17, v18, v17
-; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX10-NEXT: v_add_f32_e32 v17, v20, v19
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v14, v18, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v6
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
; GFX10-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
-; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v14, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
-; GFX10-NEXT: v_add_f32_e32 v13, v19, v18
-; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add_f32_e32 v13, v20, v13
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_add_f32_e32 v12, v18, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v18, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX10-NEXT: v_add_f32_e32 v18, v19, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX10-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v11, v19, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX10-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX10-NEXT: v_add_f32_e32 v19, v22, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_add_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_add_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_add_f32_e32 v19, v22, v19
+; GFX10-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
-; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX10-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -10633,156 +11398,170 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v17, v18, v17
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: v_add_f32_e32 v7, v7, v15
-; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v14, 16, v14
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX11-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
-; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
-; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v13, v20, v13
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e32 v12, v18, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v18, v19, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_dual_add_f32 v12, v18, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
; GFX11-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v11, v19, v11 :: v_dual_cndmask_b32 v12, v18, v21
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX11-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_lshlrev_b32 v19, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX11-NEXT: v_add_f32_e32 v19, v22, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9
-; GFX11-NEXT: v_add_f32_e32 v9, v22, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX11-NEXT: v_add_f32_e32 v19, v22, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX11-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <16 x bfloat> %a, %b
@@ -10793,527 +11572,735 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_fadd_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v29, v29, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_add_f32_e32 v31, v31, v32
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v25, v25, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
-; GCN-NEXT: v_add_f32_e32 v30, v30, v32
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v21, v21, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v17, v17, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v13, v13, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v9, v9, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v5, v5, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_add_f32_e32 v31, v1, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_add_f32_e32 v29, v29, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_add_f32_e32 v1, v1, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_add_f32_e32 v30, v30, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v28, v28, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v27, v27, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v26, v26, v32
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_add_f32_e32 v25, v25, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v24, v24, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v23, v23, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v22, v22, v32
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_add_f32_e32 v21, v21, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v20, v20, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v19, v19, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v18, v18, v32
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_add_f32_e32 v17, v17, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v16, v16, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v15, v15, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v14, v14, v32
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_add_f32_e32 v13, v13, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v12, v12, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v11, v11, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v10, v10, v32
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_add_f32_e32 v9, v9, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v8, v8, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v7, v7, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v6, v6, v32
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_add_f32_e32 v5, v5, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v4, v4, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v3, v3, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v2, v2, v32
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_add_f32_e32 v1, v1, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_add_f32_e32 v0, v0, v32
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GCN-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GCN-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GCN-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GCN-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_add_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v30, v30, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v29, v29, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_add_f32_e32 v31, v1, v31
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_add_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v26, v26, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v22, v22, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v18, v18, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v14, v14, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v10, v10, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v6, v6, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v2, v2, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_add_f32_e32 v0, v0, v32
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v32bf16:
@@ -11323,10 +12310,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_add_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_add_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
@@ -11343,17 +12332,21 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_add_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_add_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_add_f32_e32 v33, v33, v34
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_add_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
@@ -11382,9 +12375,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_add_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
-; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
@@ -11400,9 +12395,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_add_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
-; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
@@ -11418,9 +12415,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_add_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
-; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
@@ -11436,9 +12435,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_add_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
-; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
@@ -11454,9 +12455,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_add_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
@@ -11472,9 +12475,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_add_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
@@ -11490,9 +12495,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_add_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
-; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
@@ -11508,9 +12515,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_add_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
@@ -11526,9 +12535,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_add_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
@@ -11544,9 +12555,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_add_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
@@ -11562,9 +12575,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_add_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
@@ -11580,9 +12595,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_add_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
@@ -11598,9 +12615,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_add_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
@@ -11649,9 +12668,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
@@ -11666,59 +12687,67 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_add_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX9-NEXT: v_add_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v32
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
+; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_add_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
@@ -11732,9 +12761,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_add_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
@@ -11748,9 +12779,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_add_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
@@ -11764,9 +12797,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_add_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
@@ -11780,9 +12815,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
@@ -11796,9 +12833,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
@@ -11812,9 +12851,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
@@ -11828,9 +12869,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
@@ -11844,9 +12887,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
@@ -11860,9 +12905,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
@@ -11876,9 +12923,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
@@ -11892,9 +12941,11 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
@@ -11929,557 +12980,627 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19
; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v17
+; GFX10-NEXT: v_add_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_add_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v18
+; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v16
+; GFX10-NEXT: v_add_f32_e32 v68, v68, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_bfe_u32 v21, v55, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v67
+; GFX10-NEXT: v_add_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_add_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_add_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v52, v33, 16, 1
; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_add_f32_e32 v24, v64, v55
; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_add_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_add_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_bfe_u32 v23, v53, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v55
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_bfe_u32 v19, v65, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_bfe_u32 v17, v68, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX10-NEXT: v_add3_u32 v21, v21, v55, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v33
+; GFX10-NEXT: v_add_f32_e32 v14, v14, v15
+; GFX10-NEXT: v_bfe_u32 v15, v35, 16, 1
; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v22, v30, v22
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v53
; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_add_f32_e32 v18, v27, v48
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v65
; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_add_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_add_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v68
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v33, v52, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v52, v34, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v53, v53
+; GFX10-NEXT: v_add3_u32 v23, v23, v53, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v53, v48, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v65, v65
+; GFX10-NEXT: v_add3_u32 v19, v19, v65, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v68, v68
+; GFX10-NEXT: v_add3_u32 v17, v17, v68, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v21, v20, s10
+; GFX10-NEXT: v_bfe_u32 v21, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_add_f32_e32 v50, v69, v66
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v37
+; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_bfe_u32 v27, v49, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v15, v15, v35, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v34, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v48, v48
+; GFX10-NEXT: v_add3_u32 v34, v52, v34, 0x7fff
+; GFX10-NEXT: v_add3_u32 v48, v53, v48, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v30, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v19, v18, s11
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v16, s12
+; GFX10-NEXT: v_bfe_u32 v17, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v21, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v21, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v39
+; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v49
+; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_bfe_u32 v25, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v39, v39
+; GFX10-NEXT: v_add3_u32 v28, v28, v39, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT: v_add3_u32 v27, v27, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v38, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v36, v36
+; GFX10-NEXT: v_bfe_u32 v52, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v34, v34, v35, s13
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
-; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v48, v48, v55, s16
+; GFX10-NEXT: v_bfe_u32 v55, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v17, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v1, v1
+; GFX10-NEXT: v_add3_u32 v1, v21, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v51
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v51, v51
+; GFX10-NEXT: v_add3_u32 v25, v25, v51, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v38
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v38, v38
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v14
+; GFX10-NEXT: v_add3_u32 v38, v49, v38, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v36, v36, v39, s14
+; GFX10-NEXT: v_bfe_u32 v39, v22, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v52, v14, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5
+; GFX10-NEXT: v_add3_u32 v5, v55, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s16
+; GFX10-NEXT: v_bfe_u32 v65, v50, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v38, v38, v51, s15
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v22
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v22, v22
+; GFX10-NEXT: v_add3_u32 v22, v39, v22, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v52, s12
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v50
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v50, v50
+; GFX10-NEXT: v_bfe_u32 v49, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v65, v50, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v67, s6
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v19, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v22, v51, s11
+; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v64, s4
+; GFX10-NEXT: v_bfe_u32 v64, v11, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v66, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v26, v27, v26, s7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v25, v24, s8
+; GFX10-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v50, v50, v68, s17
+; GFX10-NEXT: v_bfe_u32 v68, v4, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v49, v13, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v65, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v67, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v39, v0, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s10
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v64, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v25, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v0, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v2, v2
+; GFX10-NEXT: v_add3_u32 v4, v68, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v65, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v67, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v39, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v37, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v53, s4
+; GFX10-NEXT: v_perm_b32 v7, v7, v20, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v54, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v66, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v30, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v55, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v64, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v25, s17
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x7060302
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v26, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v15, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
-; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
-; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
-; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_add_f32_e32 v21, v31, v21
+; GFX10-NEXT: v_add_f32_e32 v16, v6, v17
+; GFX10-NEXT: v_perm_b32 v6, v19, v18, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX10-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v16, v16
+; GFX10-NEXT: v_add3_u32 v16, v19, v16, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v20, s4
+; GFX10-NEXT: v_perm_b32 v15, v16, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v18
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v6
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v3, v3, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
-; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
-; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
-; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX11-NEXT: v_add_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_add_f32_e32 v26, v52, v51
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX11-NEXT: v_add_f32_e32 v28, v48, v39
-; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
-; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
-; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35
+; GFX11-NEXT: v_add_f32_e32 v35, v38, v37
+; GFX11-NEXT: v_add_f32_e32 v37, v50, v49
+; GFX11-NEXT: v_dual_add_f32 v49, v66, v65 :: v_dual_add_f32 v50, v68, v67
+; GFX11-NEXT: v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v23, v49, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_bfe_u32 v27, v37, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v51, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v23, v23, v49, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_add3_u32 v21, v21, v51, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_add_f32_e32 v36, v48, v39
+; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v37
+; GFX11-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX11-NEXT: v_add_f32_e32 v48, v64, v55
+; GFX11-NEXT: v_bfe_u32 v64, v33, 16, 1
+; GFX11-NEXT: v_bfe_u32 v28, v36, 16, 1
+; GFX11-NEXT: v_bfe_u32 v26, v38, 16, 1
+; GFX11-NEXT: v_add3_u32 v27, v27, v37, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_or_b32_e32 v65, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v36
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v38
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v17
+; GFX11-NEXT: v_dual_add_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_bfe_u32 v30, v34, 16, 1
+; GFX11-NEXT: v_add3_u32 v16, v64, v33, 0x7fff
+; GFX11-NEXT: v_add3_u32 v28, v28, v36, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v26, v26, v38, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v34
+; GFX11-NEXT: v_add3_u32 v30, v30, v34, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v65, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v35
+; GFX11-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v52, v80, v71
+; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v66, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT: v_add_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_bfe_u32 v20, v52, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, v29, v67, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v54, v84, v83
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-NEXT: v_add_f32_e32 v55, v86, v85
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v52
+; GFX11-NEXT: v_add3_u32 v20, v20, v52, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v18, v54, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v68, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-NEXT: v_add3_u32 v18, v18, v54, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v39
+; GFX11-NEXT: v_bfe_u32 v24, v48, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v27, v27, v69, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-NEXT: v_add_f32_e32 v9, v9, v25
+; GFX11-NEXT: v_bfe_u32 v25, v39, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v48
+; GFX11-NEXT: v_add3_u32 v24, v24, v48, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v70, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-NEXT: v_add3_u32 v25, v25, v39, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v53, v82, v81
+; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v49
+; GFX11-NEXT: v_bfe_u32 v22, v50, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v50
+; GFX11-NEXT: v_cndmask_b32_e32 v25, v25, v71, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v51
+; GFX11-NEXT: v_add3_u32 v22, v22, v50, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v53, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v53
+; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v80, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-NEXT: v_bfe_u32 v17, v55, 16, 1
+; GFX11-NEXT: v_add3_u32 v19, v19, v53, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v87, 0x400000, v55
+; GFX11-NEXT: v_bfe_u32 v64, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, v23, v81, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-NEXT: v_add3_u32 v17, v17, v55, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v14
+; GFX11-NEXT: v_bfe_u32 v97, v13, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v14, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v82, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v99, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v97, v13, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v21, v83, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-NEXT: v_bfe_u32 v101, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v35, v99, v12, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v11
+; GFX11-NEXT: v_bfe_u32 v103, v10, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v84, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-NEXT: v_add3_u32 v36, v101, v11, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v10
+; GFX11-NEXT: v_bfe_u32 v113, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v103, v10, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v85, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v115, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v113, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v117, v7, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v18, v86, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-NEXT: v_add3_u32 v39, v115, v8, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v7
+; GFX11-NEXT: v_bfe_u32 v119, v6, 16, 1
+; GFX11-NEXT: v_add3_u32 v48, v117, v7, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v87, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
-; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
-; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
-; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v129, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v49, v119, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v5
+; GFX11-NEXT: v_bfe_u32 v131, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v64, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
-; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
-; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
+; GFX11-NEXT: v_add3_u32 v50, v129, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v131, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v34, v98, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
-; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v133, v3, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v145, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v35, v100, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
+; GFX11-NEXT: v_add3_u32 v55, v147, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v54, v145, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v135, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v36, v102, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT: v_add3_u32 v53, v135, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v37, v112, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
-; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v38, v114, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v39, v116, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v48, v118, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v49, v128, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v50, v130, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v51, v132, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v52, v134, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v55, v33, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v54, v146, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
-; GFX11-NEXT: v_add_f32_e32 v15, v15, v18
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_cndmask_b32 v2, v53, v144
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v15, v15, v18
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <32 x bfloat> %a, %b
@@ -12491,18 +13612,22 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16_fpimm_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16_fpimm_0:
@@ -12570,18 +13695,22 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16_fpimm_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16_fpimm_1:
@@ -12650,10 +13779,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_bf16:
@@ -12661,10 +13793,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_bf16:
@@ -12739,31 +13874,45 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_sub_f32_e32 v1, v1, v3
; GCN-NEXT: v_sub_f32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_v2bf16:
@@ -12773,9 +13922,11 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -12796,9 +13947,11 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -12818,49 +13971,53 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-LABEL: v_fsub_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_sub_f32_e32 v1, v3, v2
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fsub_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_sub_f32 v1, v3, v2
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fsub <2 x bfloat> %a, %b
ret <2 x bfloat> %op
@@ -12872,45 +14029,65 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
+; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
; GCN-NEXT: v_sub_f32_e32 v0, v0, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_v3bf16:
@@ -12929,10 +14106,12 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -12963,9 +14142,11 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
@@ -12985,31 +14166,33 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-LABEL: v_fsub_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX10-NEXT: v_sub_f32_e32 v2, v5, v4
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = fsub <3 x bfloat> %a, %b
@@ -13022,57 +14205,83 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_sub_f32_e32 v1, v1, v5
; GCN-NEXT: v_sub_f32_e32 v3, v3, v7
; GCN-NEXT: v_sub_f32_e32 v2, v2, v6
-; GCN-NEXT: v_sub_f32_e32 v1, v1, v5
; GCN-NEXT: v_sub_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_v4bf16:
@@ -13082,9 +14291,11 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
@@ -13101,9 +14312,11 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -13126,9 +14339,11 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
@@ -13143,9 +14358,11 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -13167,81 +14384,88 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX10-NEXT: v_sub_f32_e32 v2, v5, v6
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fsub_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v5, v6
+; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fsub <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -13253,10 +14477,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_bf16:
@@ -13264,10 +14491,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_bf16:
@@ -13342,31 +14572,45 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v2bf16:
@@ -13376,9 +14620,11 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -13399,9 +14645,11 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
@@ -13421,49 +14669,53 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-LABEL: v_fmul_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v1, v3, v2
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_mul_f32 v1, v3, v2
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <2 x bfloat> %a, %b
ret <2 x bfloat> %op
@@ -13475,45 +14727,65 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v3bf16:
@@ -13532,10 +14804,12 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -13566,9 +14840,11 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
@@ -13588,31 +14864,33 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-LABEL: v_fmul_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v2, v5, v4
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = fmul <3 x bfloat> %a, %b
@@ -13625,57 +14903,83 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v4bf16:
@@ -13685,9 +14989,11 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
@@ -13704,9 +15010,11 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -13729,9 +15037,11 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
@@ -13746,9 +15056,11 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -13770,81 +15082,88 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v2, v5, v6
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v2, v5, v6
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -13856,105 +15175,157 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v9
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v8bf16:
@@ -13964,9 +15335,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
@@ -13983,9 +15356,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
@@ -14001,9 +15376,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
@@ -14019,9 +15396,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
@@ -14048,9 +15427,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
@@ -14065,9 +15446,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
@@ -14081,9 +15464,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
@@ -14097,9 +15482,11 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
@@ -14123,155 +15510,171 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v9, v10, v9
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6
-; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mul_f32_e32 v9, v12, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v4, v15, v14
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v9
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v14, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX10-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_mul_f32_e32 v7, v10, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: v_mul_f32_e32 v9, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e32 v6, v10, v6
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_mul_f32 v2, v2, v6
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
-; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_mul_f32_e32 v9, v12, v11
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v4, v15, v14
; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v14, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX11-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <8 x bfloat> %a, %b
ret <8 x bfloat> %op
@@ -14281,220 +15684,326 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_fmul_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_mul_f32_e32 v13, v13, v29
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v17
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v17
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v17
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v19
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v18
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v17
; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v29, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, v9, v13
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v21, v5, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v31, v1, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
-; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v19
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v28
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v25
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v23
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v22
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v16
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v13
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v21, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v20, 0, 16
+; GFX7-NEXT: v_bfe_u32 v9, v18, 0, 16
+; GFX7-NEXT: v_bfe_u32 v13, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
@@ -14510,9 +16019,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
@@ -14528,9 +16039,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
@@ -14546,9 +16059,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
@@ -14564,9 +16079,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
@@ -14582,9 +16099,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
@@ -14600,9 +16119,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
@@ -14618,9 +16139,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
@@ -14655,9 +16178,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
@@ -14672,9 +16197,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
@@ -14688,9 +16215,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
@@ -14704,9 +16233,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
@@ -14720,9 +16251,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
@@ -14736,9 +16269,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
@@ -14752,9 +16287,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
@@ -14768,9 +16305,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
@@ -14798,139 +16337,155 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
-; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v14, v18, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v6
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
-; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v14, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18
-; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_mul_f32_e32 v13, v20, v13
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v18, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX10-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v11, v19, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX10-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX10-NEXT: v_mul_f32_e32 v19, v22, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_mul_f32_e32 v19, v22, v19
+; GFX10-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
-; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX10-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -14938,156 +16493,170 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17
-; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
-; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v14, 16, v14
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX11-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
-; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_mul_f32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
-; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX11-NEXT: v_mul_f32_e32 v13, v20, v13
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v18, v19, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_dual_mul_f32 v12, v18, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v11, v19, v11 :: v_dual_cndmask_b32 v12, v18, v21
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX11-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_lshlrev_b32 v19, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX11-NEXT: v_mul_f32_e32 v19, v22, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9
-; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX11-NEXT: v_mul_f32_e32 v19, v22, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX11-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <16 x bfloat> %a, %b
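
The dominant codegen change in the regenerated checks is that the bfloat high-half mask, previously `v_and_b32_e32 vN, 0xffff0000, vN`, is now emitted as a `v_lshrrev_b32_e32`/`v_lshlrev_b32_e32` pair. Both forms clear the low 16 bits of a 32-bit lane, so the values the checks compute are unchanged; a minimal C sketch of the underlying identity (illustration only, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* AND with 0xffff0000 keeps only the high half of a 32-bit lane... */
static uint32_t mask_high(uint32_t x) { return x & 0xffff0000u; }

/* ...and a logical right shift followed by a left shift by the same
 * amount produces the same value, which is the form these checks now
 * use. */
static uint32_t shift_pair(uint32_t x) { return (x >> 16) << 16; }

int main(void) {
  /* Spot-check the identity on an arbitrary stride of inputs. */
  for (uint32_t x = 0; x < (1u << 24); x += 7919u)
    assert(mask_high(x) == shift_pair(x));
  return 0;
}

Whether the shift pair is the preferable form per target is a separate review question; the identity itself is what keeps these autogenerated checks semantically equivalent.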
@@ -15098,527 +16667,735 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_fmul_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v29, v29, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_mul_f32_e32 v31, v31, v32
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v25, v25, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
-; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v21, v21, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v17, v17, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, v1, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_mul_f32_e32 v29, v29, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v27, v27, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_mul_f32_e32 v25, v25, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v23, v23, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_mul_f32_e32 v21, v21, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v19, v19, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_mul_f32_e32 v17, v17, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v15, v15, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_mul_f32_e32 v13, v13, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v11, v11, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_mul_f32_e32 v9, v9, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v7, v7, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v3, v3, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GCN-NEXT: v_mul_f32_e32 v0, v0, v32
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GCN-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GCN-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GCN-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GCN-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, v1, v31
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v32bf16:
@@ -15628,10 +17405,12 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
@@ -15648,17 +17427,21 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
@@ -15687,9 +17470,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
-; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
@@ -15705,9 +17490,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
-; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
@@ -15723,9 +17510,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
-; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
@@ -15741,9 +17530,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
-; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
@@ -15759,9 +17550,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
@@ -15777,9 +17570,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
@@ -15795,9 +17590,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
-; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
@@ -15813,9 +17610,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
@@ -15831,9 +17630,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
@@ -15849,9 +17650,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
@@ -15867,9 +17670,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
@@ -15885,9 +17690,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
@@ -15903,9 +17710,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
@@ -15954,9 +17763,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
@@ -15971,59 +17782,67 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v32
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
+; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
@@ -16037,9 +17856,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
@@ -16053,9 +17874,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
@@ -16069,9 +17892,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
@@ -16085,9 +17910,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
@@ -16101,9 +17928,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
@@ -16117,9 +17946,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
@@ -16133,9 +17964,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
@@ -16149,9 +17982,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
@@ -16165,9 +18000,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
@@ -16181,9 +18018,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
@@ -16197,9 +18036,11 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
@@ -16234,557 +18075,627 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19
; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_mul_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v17
+; GFX10-NEXT: v_mul_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v18
+; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v16
+; GFX10-NEXT: v_mul_f32_e32 v68, v68, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_bfe_u32 v21, v55, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v67
+; GFX10-NEXT: v_mul_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_mul_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_mul_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v52, v33, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55
; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_bfe_u32 v23, v53, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v55
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_bfe_u32 v19, v65, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_bfe_u32 v17, v68, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX10-NEXT: v_add3_u32 v21, v21, v55, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v33
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v15
+; GFX10-NEXT: v_bfe_u32 v15, v35, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v22, v30, v22
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v53
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v65
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v68
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v33, v52, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v52, v34, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v53, v53
+; GFX10-NEXT: v_add3_u32 v23, v23, v53, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v53, v48, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v65, v65
+; GFX10-NEXT: v_add3_u32 v19, v19, v65, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v68, v68
+; GFX10-NEXT: v_add3_u32 v17, v17, v68, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v21, v20, s10
+; GFX10-NEXT: v_bfe_u32 v21, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_mul_f32_e32 v50, v69, v66
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v37
+; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_bfe_u32 v27, v49, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v15, v15, v35, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v34, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v48, v48
+; GFX10-NEXT: v_add3_u32 v34, v52, v34, 0x7fff
+; GFX10-NEXT: v_add3_u32 v48, v53, v48, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v30, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v19, v18, s11
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v16, s12
+; GFX10-NEXT: v_bfe_u32 v17, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v21, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v21, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v39
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v49
+; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_bfe_u32 v25, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v39, v39
+; GFX10-NEXT: v_add3_u32 v28, v28, v39, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT: v_add3_u32 v27, v27, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v38, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v36, v36
+; GFX10-NEXT: v_bfe_u32 v52, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v34, v34, v35, s13
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
-; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v48, v48, v55, s16
+; GFX10-NEXT: v_bfe_u32 v55, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v17, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v1, v1
+; GFX10-NEXT: v_add3_u32 v1, v21, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v51
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v51, v51
+; GFX10-NEXT: v_add3_u32 v25, v25, v51, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v38
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v38, v38
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v14
+; GFX10-NEXT: v_add3_u32 v38, v49, v38, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v36, v36, v39, s14
+; GFX10-NEXT: v_bfe_u32 v39, v22, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v52, v14, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5
+; GFX10-NEXT: v_add3_u32 v5, v55, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s16
+; GFX10-NEXT: v_bfe_u32 v65, v50, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v38, v38, v51, s15
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v22
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v22, v22
+; GFX10-NEXT: v_add3_u32 v22, v39, v22, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v52, s12
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v50
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v50, v50
+; GFX10-NEXT: v_bfe_u32 v49, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v65, v50, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v67, s6
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v19, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v22, v51, s11
+; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v64, s4
+; GFX10-NEXT: v_bfe_u32 v64, v11, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v66, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v26, v27, v26, s7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v25, v24, s8
+; GFX10-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v50, v50, v68, s17
+; GFX10-NEXT: v_bfe_u32 v68, v4, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v49, v13, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v65, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v67, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v39, v0, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s10
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v64, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v25, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v0, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v2, v2
+; GFX10-NEXT: v_add3_u32 v4, v68, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v65, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v67, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v39, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v37, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v53, s4
+; GFX10-NEXT: v_perm_b32 v7, v7, v20, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v54, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v66, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v30, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v55, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v64, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v25, s17
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x7060302
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v26, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v15, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
-; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
-; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
-; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_mul_f32_e32 v21, v31, v21
+; GFX10-NEXT: v_mul_f32_e32 v16, v6, v17
+; GFX10-NEXT: v_perm_b32 v6, v19, v18, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX10-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v16, v16
+; GFX10-NEXT: v_add3_u32 v16, v19, v16, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v20, s4
+; GFX10-NEXT: v_perm_b32 v15, v16, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v6
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v3, v3, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
-; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
-; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
-; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39
-; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
-; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
-; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35
+; GFX11-NEXT: v_mul_f32_e32 v35, v38, v37
+; GFX11-NEXT: v_mul_f32_e32 v37, v50, v49
+; GFX11-NEXT: v_dual_mul_f32 v49, v66, v65 :: v_dual_mul_f32 v50, v68, v67
+; GFX11-NEXT: v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v23, v49, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_bfe_u32 v27, v37, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v51, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v23, v23, v49, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_add3_u32 v21, v21, v51, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_mul_f32_e32 v36, v48, v39
+; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v37
+; GFX11-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX11-NEXT: v_mul_f32_e32 v48, v64, v55
+; GFX11-NEXT: v_bfe_u32 v64, v33, 16, 1
+; GFX11-NEXT: v_bfe_u32 v28, v36, 16, 1
+; GFX11-NEXT: v_bfe_u32 v26, v38, 16, 1
+; GFX11-NEXT: v_add3_u32 v27, v27, v37, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_or_b32_e32 v65, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v36
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v38
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX11-NEXT: v_dual_mul_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_bfe_u32 v30, v34, 16, 1
+; GFX11-NEXT: v_add3_u32 v16, v64, v33, 0x7fff
+; GFX11-NEXT: v_add3_u32 v28, v28, v36, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v26, v26, v38, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v34
+; GFX11-NEXT: v_add3_u32 v30, v30, v34, 0x7fff
+; GFX11-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v65, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v35
+; GFX11-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX11-NEXT: v_mul_f32_e32 v52, v80, v71
+; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v66, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT: v_mul_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_bfe_u32 v20, v52, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, v29, v67, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v54, v84, v83
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-NEXT: v_mul_f32_e32 v55, v86, v85
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v52
+; GFX11-NEXT: v_add3_u32 v20, v20, v52, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v18, v54, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v68, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-NEXT: v_add3_u32 v18, v18, v54, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v39
+; GFX11-NEXT: v_bfe_u32 v24, v48, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v27, v27, v69, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX11-NEXT: v_bfe_u32 v25, v39, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v48
+; GFX11-NEXT: v_add3_u32 v24, v24, v48, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v70, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-NEXT: v_add3_u32 v25, v25, v39, 0x7fff
+; GFX11-NEXT: v_mul_f32_e32 v53, v82, v81
+; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v49
+; GFX11-NEXT: v_bfe_u32 v22, v50, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v50
+; GFX11-NEXT: v_cndmask_b32_e32 v25, v25, v71, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v51
+; GFX11-NEXT: v_add3_u32 v22, v22, v50, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v53, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v53
+; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v80, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-NEXT: v_bfe_u32 v17, v55, 16, 1
+; GFX11-NEXT: v_add3_u32 v19, v19, v53, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v87, 0x400000, v55
+; GFX11-NEXT: v_bfe_u32 v64, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, v23, v81, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-NEXT: v_add3_u32 v17, v17, v55, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v14
+; GFX11-NEXT: v_bfe_u32 v97, v13, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v14, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v82, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v99, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v97, v13, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v21, v83, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-NEXT: v_bfe_u32 v101, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v35, v99, v12, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v11
+; GFX11-NEXT: v_bfe_u32 v103, v10, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v84, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-NEXT: v_add3_u32 v36, v101, v11, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v10
+; GFX11-NEXT: v_bfe_u32 v113, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v103, v10, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v85, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v115, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v113, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v117, v7, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v18, v86, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-NEXT: v_add3_u32 v39, v115, v8, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v7
+; GFX11-NEXT: v_bfe_u32 v119, v6, 16, 1
+; GFX11-NEXT: v_add3_u32 v48, v117, v7, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v87, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
-; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
-; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
-; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v129, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v49, v119, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v5
+; GFX11-NEXT: v_bfe_u32 v131, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v64, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
-; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
-; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
+; GFX11-NEXT: v_add3_u32 v50, v129, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v131, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v34, v98, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
-; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v133, v3, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v145, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v35, v100, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
+; GFX11-NEXT: v_add3_u32 v55, v147, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v54, v145, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v135, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v36, v102, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT: v_add3_u32 v53, v135, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v37, v112, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
-; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v38, v114, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v39, v116, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v48, v118, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v49, v128, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v50, v130, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v51, v132, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v52, v134, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v55, v33, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v54, v146, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
-; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_cndmask_b32 v2, v53, v144
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <32 x bfloat> %a, %b
@@ -16797,8 +18708,10 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GCN-NEXT: v_rcp_f32_e32 v3, v2
; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0
@@ -16810,7 +18723,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4
; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fdiv_bf16:
@@ -16818,8 +18732,10 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
@@ -16831,7 +18747,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_bf16:
@@ -16950,18 +18867,22 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fabs_bf16:
@@ -17145,22 +19066,28 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fneg_fabs_bf16:
@@ -17261,10 +19188,15 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_bf16:
@@ -17272,10 +19204,15 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_bf16:
@@ -17350,31 +19287,53 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v1, v1, v3
; GCN-NEXT: v_min_f32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v2bf16:
@@ -17384,9 +19343,11 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -17407,9 +19368,11 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
@@ -17429,49 +19392,53 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-LABEL: v_minnum_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_min_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_min_f32_e32 v1, v3, v2
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_min_f32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v3, v2
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %op
@@ -17483,45 +19450,77 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_min_f32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v1, v1, v4
+; GCN-NEXT: v_min_f32_e32 v2, v2, v5
; GCN-NEXT: v_min_f32_e32 v0, v0, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v3bf16:
@@ -17540,10 +19539,12 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -17574,9 +19575,11 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
@@ -17596,31 +19599,33 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-LABEL: v_minnum_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX10-NEXT: v_min_f32_e32 v2, v5, v4
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
@@ -17633,57 +19638,99 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v1, v1, v5
; GCN-NEXT: v_min_f32_e32 v3, v3, v7
; GCN-NEXT: v_min_f32_e32 v2, v2, v6
-; GCN-NEXT: v_min_f32_e32 v1, v1, v5
; GCN-NEXT: v_min_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v4bf16:
@@ -17693,9 +19740,11 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
@@ -17712,9 +19761,11 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -17737,9 +19788,11 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
@@ -17754,9 +19807,11 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -17778,81 +19833,88 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_min_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v2, v5, v6
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v5, v6
+; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
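
A note for readers skimming these hunks: the one semantic change running through the check lines is that the high-half extract previously emitted as an AND with 0xffff0000 now comes out as a lshr/shl pair (v_lshrrev_b32 16 followed by v_lshlrev_b32 16), once the combiner reasons about the shifts directly. A minimal C sketch of why the two forms are interchangeable; the helper names are illustrative, not from the patch:

    #include <assert.h>
    #include <stdint.h>

    /* Both forms isolate the bf16 value stored in the high half of a
     * 32-bit register: the old single AND and the new shr/shl pair
     * clear exactly the low 16 bits. */
    static uint32_t high_bf16_and(uint32_t x)   { return x & 0xffff0000u; }
    static uint32_t high_bf16_shift(uint32_t x) { return (x >> 16) << 16; }

    int main(void) {
        for (uint64_t x = 0; x <= 0xffffffffu; x += 0x10001u)
            assert(high_bf16_and((uint32_t)x) == high_bf16_shift((uint32_t)x));
        return 0;
    }

The shift form spends two VALU instructions where the mask spent one, which accounts for the instruction-count growth visible across these updated checks.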
@@ -17864,105 +19926,189 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v5, v5, v13
+; GCN-NEXT: v_min_f32_e32 v1, v1, v9
; GCN-NEXT: v_min_f32_e32 v7, v7, v15
; GCN-NEXT: v_min_f32_e32 v6, v6, v14
-; GCN-NEXT: v_min_f32_e32 v5, v5, v13
; GCN-NEXT: v_min_f32_e32 v4, v4, v12
; GCN-NEXT: v_min_f32_e32 v3, v3, v11
; GCN-NEXT: v_min_f32_e32 v2, v2, v10
-; GCN-NEXT: v_min_f32_e32 v1, v1, v9
; GCN-NEXT: v_min_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v9
; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v8bf16:
@@ -17972,9 +20118,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
@@ -17991,9 +20139,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
@@ -18009,9 +20159,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
@@ -18027,9 +20179,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
@@ -18056,9 +20210,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
@@ -18073,9 +20229,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
@@ -18089,9 +20247,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
@@ -18105,9 +20265,11 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
@@ -18131,155 +20293,171 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX10-NEXT: v_min_f32_e32 v7, v10, v9
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v9, v10, v9
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_min_f32_e32 v6, v10, v6
-; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_min_f32_e32 v9, v12, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_min_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v4, v15, v14
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v9
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v14, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX10-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_min_f32_e32 v7, v10, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: v_min_f32_e32 v9, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_min_f32_e32 v6, v10, v6
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_min_f32 v2, v2, v6
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
-; GFX11-NEXT: v_min_f32_e32 v5, v15, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v9, v12, v11
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v4, v15, v14
; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v14, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX11-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
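
The GFX8+ blocks above all repeat one f32-to-bf16 conversion idiom: v_bfe_u32 pulls out bit 16 (the LSB of the would-be bf16 mantissa), v_add3_u32 folds it in together with the 0x7fff bias to round to nearest-even, and the v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 trio substitutes a quieted NaN when the self-compare is unordered. A scalar C model of that sequence, for reference only; the function name is illustrative and this is a sketch of the rounding logic, not the compiler's code:

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    /* Round-to-nearest-even f32 -> bf16, mirroring the check lines:
     * the lsb extract is the v_bfe_u32, the bias add is the v_add3_u32,
     * and the NaN path models the 0x400000 OR that v_cndmask_b32
     * selects when v_cmp_u_f32 reports "unordered". */
    static uint16_t f32_to_bf16_rne(float f) {
        uint32_t x;
        memcpy(&x, &f, sizeof x);
        if (isnan(f))                      /* v_cmp_u_f32 + v_cndmask path */
            return (uint16_t)((x | 0x400000u) >> 16);
        uint32_t lsb = (x >> 16) & 1u;     /* v_bfe_u32 lsb, x, 16, 1 */
        x += lsb + 0x7fffu;                /* v_add3_u32 x, lsb, x, 0x7fff */
        return (uint16_t)(x >> 16);
    }

    int main(void) {
        /* 1.0f is exactly representable in bf16: expect 0x3f80. */
        return f32_to_bf16_rne(1.0f) == 0x3f80 ? 0 : 1;
    }
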
@@ -18289,224 +20467,394 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_minnum_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_min_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_min_f32_e32 v13, v13, v29
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_min_f32_e32 v12, v12, v28
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_min_f32_e32 v11, v11, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_min_f32_e32 v9, v9, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_min_f32_e32 v8, v8, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_min_f32_e32 v7, v7, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_min_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v1, v1, v17
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_min_f32_e32 v14, v14, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_min_f32_e32 v12, v12, v17
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_min_f32_e32 v11, v11, v17
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_min_f32_e32 v10, v10, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_min_f32_e32 v8, v8, v17
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_min_f32_e32 v7, v7, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_min_f32_e32 v4, v4, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_min_f32_e32 v6, v6, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_min_f32_e32 v3, v3, v19
-; GCN-NEXT: v_min_f32_e32 v2, v2, v18
-; GCN-NEXT: v_min_f32_e32 v1, v1, v17
-; GCN-NEXT: v_min_f32_e32 v0, v0, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_min_f32_e32 v4, v4, v19
+; GCN-NEXT: v_min_f32_e32 v3, v3, v18
+; GCN-NEXT: v_min_f32_e32 v2, v2, v17
+; GCN-NEXT: v_min_f32_e32 v0, v0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_min_f32_e32 v29, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_min_f32_e32 v25, v9, v13
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v31, v5, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_min_f32_e32 v32, v1, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
-; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v21
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v28
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v32
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v25
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v23
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v22
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v16
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v13
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v20, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v19, 0, 16
+; GFX7-NEXT: v_bfe_u32 v9, v18, 0, 16
+; GFX7-NEXT: v_bfe_u32 v13, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
+; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
@@ -18518,9 +20866,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
@@ -18536,9 +20886,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
@@ -18554,9 +20906,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
@@ -18572,9 +20926,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
@@ -18590,9 +20946,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
@@ -18608,9 +20966,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
@@ -18626,9 +20986,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
@@ -18663,9 +21025,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
@@ -18680,9 +21044,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
@@ -18696,9 +21062,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
@@ -18712,9 +21080,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
@@ -18728,9 +21098,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
@@ -18744,9 +21116,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
@@ -18760,9 +21134,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
@@ -18776,9 +21152,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
@@ -18806,139 +21184,155 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
-; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX10-NEXT: v_min_f32_e32 v17, v20, v19
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v14, v18, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v6
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
-; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v14, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
-; GFX10-NEXT: v_min_f32_e32 v13, v19, v18
-; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_min_f32_e32 v13, v20, v13
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_min_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_min_f32_e32 v12, v18, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v18, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX10-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v11, v19, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX10-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX10-NEXT: v_min_f32_e32 v19, v22, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_min_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_min_f32_e32 v19, v22, v19
+; GFX10-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
-; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX10-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -18946,156 +21340,170 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_min_f32_e32 v17, v18, v17
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v14, 16, v14
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX11-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
-; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_min_f32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
-; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v13, v20, v13
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_min_f32_e32 v12, v18, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX11-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v18, v19, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_dual_min_f32 v12, v18, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
; GFX11-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v11, v19, v11 :: v_dual_cndmask_b32 v12, v18, v21
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX11-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_lshlrev_b32 v19, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX11-NEXT: v_min_f32_e32 v19, v22, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
-; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9
-; GFX11-NEXT: v_min_f32_e32 v9, v22, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX11-NEXT: v_min_f32_e32 v19, v22, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX11-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
@@ -19106,527 +21514,863 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_minnum_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v29, v29, v31
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v25, v25, v31
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v21, v21, v31
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v17, v17, v31
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v13, v13, v31
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v9, v9, v31
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v5, v5, v31
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_min_f32_e32 v31, v1, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_min_f32_e32 v31, v31, v32
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v1, v1, v32
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v30, v30, v32
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_min_f32_e32 v29, v29, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v28, v28, v32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v27, v27, v32
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v26, v26, v32
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_min_f32_e32 v25, v25, v32
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v24, v24, v32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v23, v23, v32
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v22, v22, v32
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_min_f32_e32 v21, v21, v32
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v20, v20, v32
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v19, v19, v32
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v18, v18, v32
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_min_f32_e32 v17, v17, v32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v16, v16, v32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v15, v15, v32
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v14, v14, v32
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_min_f32_e32 v13, v13, v32
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v12, v12, v32
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v11, v11, v32
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v10, v10, v32
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_min_f32_e32 v9, v9, v32
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v8, v8, v32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v7, v7, v32
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v6, v6, v32
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_min_f32_e32 v5, v5, v32
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v4, v4, v32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v3, v3, v32
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v2, v2, v32
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_min_f32_e32 v1, v1, v32
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v0, v0, v32
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GCN-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GCN-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GCN-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GCN-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v1, v31
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
+; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
+; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
+; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
+; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v32bf16:
@@ -19636,10 +22380,12 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
@@ -19656,17 +22402,21 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_min_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_min_f32_e32 v33, v33, v34
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_min_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
@@ -19695,9 +22445,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_min_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
-; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
@@ -19713,9 +22465,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_min_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
-; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
@@ -19731,9 +22485,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_min_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
-; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
@@ -19749,9 +22505,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_min_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
-; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
@@ -19767,9 +22525,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_min_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
@@ -19785,9 +22545,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
@@ -19803,9 +22565,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
-; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
@@ -19821,9 +22585,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
@@ -19839,9 +22605,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
@@ -19857,9 +22625,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
@@ -19875,9 +22645,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
@@ -19893,9 +22665,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
@@ -19911,9 +22685,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
@@ -19962,9 +22738,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
@@ -19979,59 +22757,67 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_min_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX9-NEXT: v_min_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v32
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
+; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_min_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_min_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
@@ -20045,9 +22831,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_min_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
@@ -20061,9 +22849,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_min_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
@@ -20077,9 +22867,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_min_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
@@ -20093,9 +22885,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
@@ -20109,9 +22903,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
@@ -20125,9 +22921,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
@@ -20141,9 +22939,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
@@ -20157,9 +22957,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
@@ -20173,9 +22975,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
@@ -20189,9 +22993,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
@@ -20205,9 +23011,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
@@ -20242,557 +23050,627 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19
; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_min_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v17
+; GFX10-NEXT: v_min_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_min_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v18
+; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v16
+; GFX10-NEXT: v_min_f32_e32 v68, v68, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_bfe_u32 v21, v55, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v67
+; GFX10-NEXT: v_min_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_min_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_min_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v52, v33, 16, 1
; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_min_f32_e32 v24, v64, v55
; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_min_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_min_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_bfe_u32 v23, v53, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v55
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_bfe_u32 v19, v65, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_bfe_u32 v17, v68, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX10-NEXT: v_add3_u32 v21, v21, v55, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v33
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v15
+; GFX10-NEXT: v_bfe_u32 v15, v35, 16, 1
; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v22, v30, v22
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v53
; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_min_f32_e32 v18, v27, v48
-; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v65
; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_min_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
-; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_min_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v68
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v33, v52, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v52, v34, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v53, v53
+; GFX10-NEXT: v_add3_u32 v23, v23, v53, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v53, v48, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v65, v65
+; GFX10-NEXT: v_add3_u32 v19, v19, v65, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v68, v68
+; GFX10-NEXT: v_add3_u32 v17, v17, v68, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v21, v20, s10
+; GFX10-NEXT: v_bfe_u32 v21, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_min_f32_e32 v50, v69, v66
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v37
+; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_bfe_u32 v27, v49, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v15, v15, v35, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v34, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v48, v48
+; GFX10-NEXT: v_add3_u32 v34, v52, v34, 0x7fff
+; GFX10-NEXT: v_add3_u32 v48, v53, v48, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v30, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v19, v18, s11
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v16, s12
+; GFX10-NEXT: v_bfe_u32 v17, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v21, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v21, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v39
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v49
+; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_bfe_u32 v25, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v39, v39
+; GFX10-NEXT: v_add3_u32 v28, v28, v39, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT: v_add3_u32 v27, v27, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v38, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v36, v36
+; GFX10-NEXT: v_bfe_u32 v52, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v34, v34, v35, s13
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
-; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v48, v48, v55, s16
+; GFX10-NEXT: v_bfe_u32 v55, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v17, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v1, v1
+; GFX10-NEXT: v_add3_u32 v1, v21, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v51
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v51, v51
+; GFX10-NEXT: v_add3_u32 v25, v25, v51, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v38
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v38, v38
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v14
+; GFX10-NEXT: v_add3_u32 v38, v49, v38, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v36, v36, v39, s14
+; GFX10-NEXT: v_bfe_u32 v39, v22, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v52, v14, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5
+; GFX10-NEXT: v_add3_u32 v5, v55, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s16
+; GFX10-NEXT: v_bfe_u32 v65, v50, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v38, v38, v51, s15
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v22
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v22, v22
+; GFX10-NEXT: v_add3_u32 v22, v39, v22, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v52, s12
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v50
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v50, v50
+; GFX10-NEXT: v_bfe_u32 v49, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v65, v50, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v67, s6
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v19, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v22, v51, s11
+; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v64, s4
+; GFX10-NEXT: v_bfe_u32 v64, v11, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v66, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v26, v27, v26, s7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v25, v24, s8
+; GFX10-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v50, v50, v68, s17
+; GFX10-NEXT: v_bfe_u32 v68, v4, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v49, v13, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v65, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v67, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v39, v0, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s10
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v64, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v25, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v0, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v2, v2
+; GFX10-NEXT: v_add3_u32 v4, v68, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v65, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v67, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v39, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v37, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v53, s4
+; GFX10-NEXT: v_perm_b32 v7, v7, v20, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v54, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v66, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v30, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v55, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v64, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v25, s17
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x7060302
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v26, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v15, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
-; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
-; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
-; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_min_f32_e32 v21, v31, v21
+; GFX10-NEXT: v_min_f32_e32 v16, v6, v17
+; GFX10-NEXT: v_perm_b32 v6, v19, v18, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX10-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v16, v16
+; GFX10-NEXT: v_add3_u32 v16, v19, v16, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v20, s4
+; GFX10-NEXT: v_perm_b32 v15, v16, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v18
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v6
+; GFX11-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_min_f32 v3, v3, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
-; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
-; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
-; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX11-NEXT: v_min_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_min_f32_e32 v26, v52, v51
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_min_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX11-NEXT: v_min_f32_e32 v28, v48, v39
-; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
-; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
-; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_min_f32 v33, v34, v33 :: v_dual_min_f32 v34, v36, v35
+; GFX11-NEXT: v_min_f32_e32 v35, v38, v37
+; GFX11-NEXT: v_min_f32_e32 v37, v50, v49
+; GFX11-NEXT: v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67
+; GFX11-NEXT: v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v51, v70, v69
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v23, v49, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_bfe_u32 v27, v37, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v51, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v23, v23, v49, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_add3_u32 v21, v21, v51, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_min_f32_e32 v36, v48, v39
+; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v37
+; GFX11-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX11-NEXT: v_min_f32_e32 v48, v64, v55
+; GFX11-NEXT: v_bfe_u32 v64, v33, 16, 1
+; GFX11-NEXT: v_bfe_u32 v28, v36, 16, 1
+; GFX11-NEXT: v_bfe_u32 v26, v38, 16, 1
+; GFX11-NEXT: v_add3_u32 v27, v27, v37, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_or_b32_e32 v65, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v36
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v38
+; GFX11-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX11-NEXT: v_dual_min_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_bfe_u32 v30, v34, 16, 1
+; GFX11-NEXT: v_add3_u32 v16, v64, v33, 0x7fff
+; GFX11-NEXT: v_add3_u32 v28, v28, v36, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v26, v26, v38, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v34
+; GFX11-NEXT: v_add3_u32 v30, v30, v34, 0x7fff
+; GFX11-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v65, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v35
+; GFX11-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX11-NEXT: v_min_f32_e32 v52, v80, v71
+; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v66, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT: v_min_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_bfe_u32 v20, v52, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, v29, v67, vcc_lo
+; GFX11-NEXT: v_min_f32_e32 v54, v84, v83
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-NEXT: v_min_f32_e32 v55, v86, v85
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v52
+; GFX11-NEXT: v_add3_u32 v20, v20, v52, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v18, v54, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v68, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-NEXT: v_add3_u32 v18, v18, v54, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v39
+; GFX11-NEXT: v_bfe_u32 v24, v48, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v27, v27, v69, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX11-NEXT: v_bfe_u32 v25, v39, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v48
+; GFX11-NEXT: v_add3_u32 v24, v24, v48, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v70, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-NEXT: v_add3_u32 v25, v25, v39, 0x7fff
+; GFX11-NEXT: v_min_f32_e32 v53, v82, v81
+; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v49
+; GFX11-NEXT: v_bfe_u32 v22, v50, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v50
+; GFX11-NEXT: v_cndmask_b32_e32 v25, v25, v71, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v51
+; GFX11-NEXT: v_add3_u32 v22, v22, v50, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v53, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v53
+; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v80, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-NEXT: v_bfe_u32 v17, v55, 16, 1
+; GFX11-NEXT: v_add3_u32 v19, v19, v53, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v87, 0x400000, v55
+; GFX11-NEXT: v_bfe_u32 v64, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, v23, v81, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-NEXT: v_add3_u32 v17, v17, v55, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v14
+; GFX11-NEXT: v_bfe_u32 v97, v13, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v14, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v82, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v99, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v97, v13, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v21, v83, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-NEXT: v_bfe_u32 v101, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v35, v99, v12, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v11
+; GFX11-NEXT: v_bfe_u32 v103, v10, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v84, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-NEXT: v_add3_u32 v36, v101, v11, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v10
+; GFX11-NEXT: v_bfe_u32 v113, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v103, v10, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v85, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v115, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v113, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v117, v7, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v18, v86, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-NEXT: v_add3_u32 v39, v115, v8, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v7
+; GFX11-NEXT: v_bfe_u32 v119, v6, 16, 1
+; GFX11-NEXT: v_add3_u32 v48, v117, v7, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v87, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
-; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
-; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
-; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v129, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v49, v119, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v5
+; GFX11-NEXT: v_bfe_u32 v131, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v64, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
-; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
-; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
+; GFX11-NEXT: v_add3_u32 v50, v129, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v131, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v34, v98, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
-; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v133, v3, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v145, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v35, v100, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
+; GFX11-NEXT: v_add3_u32 v55, v147, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v54, v145, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v135, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v36, v102, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT: v_add3_u32 v53, v135, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v37, v112, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
-; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v38, v114, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v39, v116, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v48, v118, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v49, v128, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v50, v130, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v51, v132, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v52, v134, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v55, v33, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v54, v146, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
-; GFX11-NEXT: v_min_f32_e32 v15, v15, v18
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_cndmask_b32 v2, v53, v144
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v15, v15, v18
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
@@ -20814,10 +23692,15 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_bf16:
@@ -20825,10 +23708,15 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_bf16:
@@ -20903,31 +23791,53 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v1, v1, v3
; GCN-NEXT: v_max_f32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v2bf16:
@@ -20937,9 +23847,11 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -20960,9 +23872,11 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
@@ -20982,49 +23896,53 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-LABEL: v_maxnum_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v1, v3, v2
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v1 :: v_dual_max_f32 v1, v3, v2
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %op
@@ -21036,45 +23954,77 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_max_f32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v1, v1, v4
+; GCN-NEXT: v_max_f32_e32 v2, v2, v5
; GCN-NEXT: v_max_f32_e32 v0, v0, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v3bf16:
@@ -21093,10 +24043,12 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -21127,9 +24079,11 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
@@ -21149,31 +24103,33 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-LABEL: v_maxnum_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v2, v5, v4
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
@@ -21186,57 +24142,99 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v1, v1, v5
; GCN-NEXT: v_max_f32_e32 v3, v3, v7
; GCN-NEXT: v_max_f32_e32 v2, v2, v6
-; GCN-NEXT: v_max_f32_e32 v1, v1, v5
; GCN-NEXT: v_max_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v4bf16:
@@ -21246,9 +24244,11 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
@@ -21265,9 +24265,11 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -21290,9 +24292,11 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
@@ -21307,9 +24311,11 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -21331,81 +24337,88 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v2, v5, v6
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v5, v6
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
@@ -21417,105 +24430,189 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v5, v5, v13
+; GCN-NEXT: v_max_f32_e32 v1, v1, v9
; GCN-NEXT: v_max_f32_e32 v7, v7, v15
; GCN-NEXT: v_max_f32_e32 v6, v6, v14
-; GCN-NEXT: v_max_f32_e32 v5, v5, v13
; GCN-NEXT: v_max_f32_e32 v4, v4, v12
; GCN-NEXT: v_max_f32_e32 v3, v3, v11
; GCN-NEXT: v_max_f32_e32 v2, v2, v10
-; GCN-NEXT: v_max_f32_e32 v1, v1, v9
; GCN-NEXT: v_max_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v9
; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v8bf16:
@@ -21525,9 +24622,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
@@ -21544,9 +24643,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
@@ -21562,9 +24663,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
@@ -21580,9 +24683,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
@@ -21609,9 +24714,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
@@ -21626,9 +24733,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
@@ -21642,9 +24751,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
@@ -21658,9 +24769,11 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
@@ -21684,155 +24797,171 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX10-NEXT: v_max_f32_e32 v7, v10, v9
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v9, v10, v9
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_max_f32_e32 v6, v10, v6
-; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_max_f32_e32 v9, v12, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_max_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v4, v15, v14
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v9
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v14, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX10-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_max_f32_e32 v7, v10, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: v_max_f32_e32 v9, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v7, v11, v8, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_f32_e32 v6, v10, v6
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v8, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_max_f32 v2, v2, v6
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v10, v14, v2, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
-; GFX11-NEXT: v_max_f32_e32 v5, v15, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v9, v12, v11
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v4, v15, v14
; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v5, v12, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v12, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v14, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v10, v12, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
+; GFX11-NEXT: v_add3_u32 v12, v14, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v4
+; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v14, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
@@ -21842,207 +24971,375 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_max_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_max_f32_e32 v13, v13, v29
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_max_f32_e32 v12, v12, v28
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_max_f32_e32 v11, v11, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_max_f32_e32 v9, v9, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_max_f32_e32 v8, v8, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_max_f32_e32 v7, v7, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_max_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_max_f32_e32 v1, v1, v17
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_max_f32_e32 v14, v14, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_max_f32_e32 v12, v12, v17
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_max_f32_e32 v11, v11, v17
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_max_f32_e32 v10, v10, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_max_f32_e32 v8, v8, v17
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_max_f32_e32 v7, v7, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_max_f32_e32 v4, v4, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_max_f32_e32 v6, v6, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_max_f32_e32 v3, v3, v19
-; GCN-NEXT: v_max_f32_e32 v2, v2, v18
-; GCN-NEXT: v_max_f32_e32 v1, v1, v17
-; GCN-NEXT: v_max_f32_e32 v0, v0, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_max_f32_e32 v4, v4, v19
+; GCN-NEXT: v_max_f32_e32 v3, v3, v18
+; GCN-NEXT: v_max_f32_e32 v2, v2, v17
+; GCN-NEXT: v_max_f32_e32 v0, v0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_max_f32_e32 v29, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_max_f32_e32 v25, v9, v13
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v31, v5, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_max_f32_e32 v32, v1, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v21
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v28
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v32
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v25
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v23
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v22
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v16
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v13
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v20, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v19, 0, 16
+; GFX7-NEXT: v_bfe_u32 v9, v18, 0, 16
+; GFX7-NEXT: v_bfe_u32 v13, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v16bf16:
@@ -22052,10 +25349,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
@@ -22071,9 +25370,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
@@ -22089,9 +25390,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
@@ -22107,9 +25410,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
@@ -22125,9 +25430,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
@@ -22143,9 +25450,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
@@ -22161,9 +25470,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
@@ -22179,9 +25490,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
@@ -22216,9 +25529,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
@@ -22233,9 +25548,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
@@ -22249,9 +25566,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
@@ -22265,9 +25584,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
@@ -22281,9 +25602,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
@@ -22297,9 +25620,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
@@ -22313,9 +25638,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
@@ -22329,9 +25656,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
@@ -22359,139 +25688,155 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
-; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX10-NEXT: v_max_f32_e32 v17, v20, v19
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v14, v18, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v6
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
-; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v14, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v13, v19, v18
-; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v13, v20, v13
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v12, v18, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v18, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX10-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v11, v19, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX10-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX10-NEXT: v_max_f32_e32 v19, v22, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_max_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v19, v22, v19
+; GFX10-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
-; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX10-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX10-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -22499,156 +25844,170 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_max_f32_e32 v17, v18, v17
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: v_max_f32_e32 v7, v7, v15
-; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v14, 16, v14
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v21, v15, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX11-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v17, v21, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
-; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v20, v6, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_max_f32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
-; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-NEXT: v_add3_u32 v17, v19, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v13, v20, v13
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_max_f32_e32 v12, v18, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_add3_u32 v18, v18, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX11-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
-; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v18, v19, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_dual_max_f32 v12, v18, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-NEXT: v_add3_u32 v17, v20, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
; GFX11-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v11, v19, v11 :: v_dual_cndmask_b32 v12, v18, v21
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX11-NEXT: v_add3_u32 v19, v22, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v18, v18, v11, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_lshlrev_b32 v19, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
-; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
-; GFX11-NEXT: v_max_f32_e32 v19, v22, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9
-; GFX11-NEXT: v_max_f32_e32 v9, v22, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v18, v22, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v11, v11, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v23, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_bfe_u32 v23, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v0
+; GFX11-NEXT: v_max_f32_e32 v19, v22, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v23, v23, v0, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; GFX11-NEXT: v_bfe_u32 v22, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_bfe_u32 v9, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v18, v22, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v9, v9, v19, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v18, v22, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v23, v24, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v21, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v20, vcc_lo
; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
@@ -22659,527 +26018,863 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v29, v29, v31
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v25, v25, v31
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v21, v21, v31
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v17, v17, v31
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v13, v13, v31
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v9, v9, v31
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v5, v5, v31
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_max_f32_e32 v31, v1, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_max_f32_e32 v31, v31, v32
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_max_f32_e32 v1, v1, v32
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v30, v30, v32
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_max_f32_e32 v29, v29, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v28, v28, v32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v27, v27, v32
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v26, v26, v32
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_max_f32_e32 v25, v25, v32
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v24, v24, v32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v23, v23, v32
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v22, v22, v32
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_max_f32_e32 v21, v21, v32
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v20, v20, v32
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v19, v19, v32
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v18, v18, v32
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_max_f32_e32 v17, v17, v32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v16, v16, v32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v15, v15, v32
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v14, v14, v32
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_max_f32_e32 v13, v13, v32
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v12, v12, v32
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v11, v11, v32
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v10, v10, v32
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_max_f32_e32 v9, v9, v32
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v8, v8, v32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v7, v7, v32
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v6, v6, v32
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_max_f32_e32 v5, v5, v32
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v4, v4, v32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v3, v3, v32
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v2, v2, v32
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_max_f32_e32 v1, v1, v32
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v0, v0, v32
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GCN-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GCN-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GCN-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GCN-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_bfe_u32 v25, v25, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v1, v31
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
+; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
+; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
+; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
+; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v32, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v32bf16:
@@ -23189,10 +26884,12 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
@@ -23209,17 +26906,21 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_max_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_max_f32_e32 v33, v33, v34
-; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX8-NEXT: v_max_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
@@ -23248,9 +26949,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_max_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
-; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
@@ -23266,9 +26969,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_max_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
-; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
@@ -23284,9 +26989,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_max_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
-; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
@@ -23302,9 +27009,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_max_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
-; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
@@ -23320,9 +27029,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_max_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
@@ -23338,9 +27049,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
@@ -23356,9 +27069,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
-; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
@@ -23374,9 +27089,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
@@ -23392,9 +27109,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
@@ -23410,9 +27129,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
@@ -23428,9 +27149,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
@@ -23446,9 +27169,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
@@ -23464,9 +27189,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
@@ -23515,9 +27242,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
@@ -23532,59 +27261,67 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v32
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
+; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_max_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
@@ -23598,9 +27335,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
@@ -23614,9 +27353,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
@@ -23630,9 +27371,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
@@ -23646,9 +27389,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
@@ -23662,9 +27407,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
@@ -23678,9 +27425,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
@@ -23694,9 +27443,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
@@ -23710,9 +27461,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
@@ -23726,9 +27479,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
@@ -23742,9 +27497,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
@@ -23758,9 +27515,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
@@ -23795,557 +27554,627 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19
; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_max_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v17
+; GFX10-NEXT: v_max_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_max_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v16
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_bfe_u32 v21, v55, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v67
+; GFX10-NEXT: v_max_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_max_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v52, v33, 16, 1
; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_max_f32_e32 v24, v64, v55
; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_max_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_max_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_bfe_u32 v23, v53, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v55
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_bfe_u32 v19, v65, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_bfe_u32 v17, v68, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX10-NEXT: v_add3_u32 v21, v21, v55, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v33
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v15
+; GFX10-NEXT: v_bfe_u32 v15, v35, 16, 1
; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v22, v30, v22
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v53
; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_max_f32_e32 v18, v27, v48
-; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v65
; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_max_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
-; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_max_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v68
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v33, v52, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v52, v34, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v53, v53
+; GFX10-NEXT: v_add3_u32 v23, v23, v53, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v53, v48, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v65, v65
+; GFX10-NEXT: v_add3_u32 v19, v19, v65, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v68, v68
+; GFX10-NEXT: v_add3_u32 v17, v17, v68, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v21, v20, s10
+; GFX10-NEXT: v_bfe_u32 v21, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_max_f32_e32 v50, v69, v66
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v37
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_bfe_u32 v27, v49, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v15, v15, v35, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v34, v34
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v48, v48
+; GFX10-NEXT: v_add3_u32 v34, v52, v34, 0x7fff
+; GFX10-NEXT: v_add3_u32 v48, v53, v48, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v30, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v19, v18, s11
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v16, s12
+; GFX10-NEXT: v_bfe_u32 v17, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v21, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v21, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v39
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v49
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_bfe_u32 v25, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v39, v39
+; GFX10-NEXT: v_add3_u32 v28, v28, v39, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT: v_add3_u32 v27, v27, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v38, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v36, v36
+; GFX10-NEXT: v_bfe_u32 v52, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v34, v34, v35, s13
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
-; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v48, v48, v55, s16
+; GFX10-NEXT: v_bfe_u32 v55, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v17, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v1, v1
+; GFX10-NEXT: v_add3_u32 v1, v21, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v51
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v51, v51
+; GFX10-NEXT: v_add3_u32 v25, v25, v51, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v38
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v38, v38
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v14
+; GFX10-NEXT: v_add3_u32 v38, v49, v38, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v36, v36, v39, s14
+; GFX10-NEXT: v_bfe_u32 v39, v22, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v52, v14, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5
+; GFX10-NEXT: v_add3_u32 v5, v55, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s16
+; GFX10-NEXT: v_bfe_u32 v65, v50, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v38, v38, v51, s15
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v22
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v22, v22
+; GFX10-NEXT: v_add3_u32 v22, v39, v22, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v52, s12
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v50
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v50, v50
+; GFX10-NEXT: v_bfe_u32 v49, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v65, v50, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v67, s6
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v19, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v22, v51, s11
+; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v13
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v64, s4
+; GFX10-NEXT: v_bfe_u32 v64, v11, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v66, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v26, v27, v26, s7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v25, v24, s8
+; GFX10-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v50, v50, v68, s17
+; GFX10-NEXT: v_bfe_u32 v68, v4, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v49, v13, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v65, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v67, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v39, v0, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s10
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v64, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v25, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v0, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v2, v2
+; GFX10-NEXT: v_add3_u32 v4, v68, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v65, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v67, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v39, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v37, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v53, s4
+; GFX10-NEXT: v_perm_b32 v7, v7, v20, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v54, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v66, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v30, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v55, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v64, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v25, s17
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x7060302
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v26, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v15, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
-; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
-; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
-; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_max_f32_e32 v21, v31, v21
+; GFX10-NEXT: v_max_f32_e32 v16, v6, v17
+; GFX10-NEXT: v_perm_b32 v6, v19, v18, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
+; GFX10-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v16, v16
+; GFX10-NEXT: v_add3_u32 v16, v19, v16, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v20, s4
+; GFX10-NEXT: v_perm_b32 v15, v16, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v18
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v6
+; GFX11-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_max_f32 v3, v3, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
-; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
-; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
-; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX11-NEXT: v_max_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_max_f32_e32 v26, v52, v51
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX11-NEXT: v_max_f32_e32 v28, v48, v39
-; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
-; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
-; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v34, v36, v35
+; GFX11-NEXT: v_max_f32_e32 v35, v38, v37
+; GFX11-NEXT: v_max_f32_e32 v37, v50, v49
+; GFX11-NEXT: v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67
+; GFX11-NEXT: v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v51, v70, v69
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v23, v49, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_bfe_u32 v27, v37, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v51, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v23, v23, v49, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_add3_u32 v21, v21, v51, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_max_f32_e32 v36, v48, v39
+; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v37
+; GFX11-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX11-NEXT: v_max_f32_e32 v48, v64, v55
+; GFX11-NEXT: v_bfe_u32 v64, v33, 16, 1
+; GFX11-NEXT: v_bfe_u32 v28, v36, 16, 1
+; GFX11-NEXT: v_bfe_u32 v26, v38, 16, 1
+; GFX11-NEXT: v_add3_u32 v27, v27, v37, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_or_b32_e32 v65, 0x400000, v33
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v36
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v38
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX11-NEXT: v_dual_max_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_bfe_u32 v30, v34, 16, 1
+; GFX11-NEXT: v_add3_u32 v16, v64, v33, 0x7fff
+; GFX11-NEXT: v_add3_u32 v28, v28, v36, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_add3_u32 v26, v26, v38, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v34
+; GFX11-NEXT: v_add3_u32 v30, v30, v34, 0x7fff
+; GFX11-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v65, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v35
+; GFX11-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX11-NEXT: v_max_f32_e32 v52, v80, v71
+; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v66, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT: v_max_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_bfe_u32 v20, v52, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, v29, v67, vcc_lo
+; GFX11-NEXT: v_max_f32_e32 v54, v84, v83
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-NEXT: v_max_f32_e32 v55, v86, v85
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v52
+; GFX11-NEXT: v_add3_u32 v20, v20, v52, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v18, v54, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v68, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-NEXT: v_add3_u32 v18, v18, v54, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v39
+; GFX11-NEXT: v_bfe_u32 v24, v48, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v27, v27, v69, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX11-NEXT: v_bfe_u32 v25, v39, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v48
+; GFX11-NEXT: v_add3_u32 v24, v24, v48, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v70, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-NEXT: v_add3_u32 v25, v25, v39, 0x7fff
+; GFX11-NEXT: v_max_f32_e32 v53, v82, v81
+; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v49
+; GFX11-NEXT: v_bfe_u32 v22, v50, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v50
+; GFX11-NEXT: v_cndmask_b32_e32 v25, v25, v71, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v51
+; GFX11-NEXT: v_add3_u32 v22, v22, v50, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v53, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v53
+; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v80, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-NEXT: v_bfe_u32 v17, v55, 16, 1
+; GFX11-NEXT: v_add3_u32 v19, v19, v53, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v87, 0x400000, v55
+; GFX11-NEXT: v_bfe_u32 v64, v14, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, v23, v81, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-NEXT: v_add3_u32 v17, v17, v55, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v14
+; GFX11-NEXT: v_bfe_u32 v97, v13, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v14, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v82, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v99, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v97, v13, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v21, v83, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-NEXT: v_bfe_u32 v101, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v35, v99, v12, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v11
+; GFX11-NEXT: v_bfe_u32 v103, v10, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v84, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-NEXT: v_add3_u32 v36, v101, v11, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v10
+; GFX11-NEXT: v_bfe_u32 v113, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v103, v10, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v85, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v115, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v113, v9, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v117, v7, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v18, v86, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-NEXT: v_add3_u32 v39, v115, v8, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v7
+; GFX11-NEXT: v_bfe_u32 v119, v6, 16, 1
+; GFX11-NEXT: v_add3_u32 v48, v117, v7, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v87, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
-; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
-; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
-; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v129, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v49, v119, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v5
+; GFX11-NEXT: v_bfe_u32 v131, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v64, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
-; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
-; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
+; GFX11-NEXT: v_add3_u32 v50, v129, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v131, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v34, v98, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
-; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v133, v3, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v145, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v35, v100, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
+; GFX11-NEXT: v_add3_u32 v55, v147, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v54, v145, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v135, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v36, v102, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT: v_add3_u32 v53, v135, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v37, v112, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
-; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v38, v114, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v39, v116, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v48, v118, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v49, v128, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v50, v130, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v51, v132, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v52, v134, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v55, v33, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v54, v146, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
-; GFX11-NEXT: v_max_f32_e32 v15, v15, v18
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_cndmask_b32 v2, v53, v144
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v15, v15, v18
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
@@ -24361,7 +28190,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xf800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x260
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -24378,14 +28208,16 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sqrt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0xf800000
; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -24404,7 +28236,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sqrt_bf16:
@@ -24544,18 +28377,22 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_ldexp_bf16_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ldexp_bf16_i32:
@@ -24626,23 +28463,28 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x7f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0
; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_frexp_bf16_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_frexp_bf16_i16:
@@ -24706,7 +28548,8 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
; GCN-NEXT: v_mov_b32_e32 v2, 0x41b17218
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
@@ -24724,14 +28567,16 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -24750,7 +28595,8 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log_bf16:
@@ -24884,21 +28730,24 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log2_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -24908,7 +28757,8 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log2_bf16:
@@ -25008,7 +28858,8 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
; GCN-NEXT: v_mov_b32_e32 v2, 0x411a209b
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
@@ -25026,14 +28877,16 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log10_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -25052,7 +28905,8 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log10_bf16:
@@ -25190,7 +29044,8 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; GCN-NEXT: s_mov_b32 s5, 0x42b17218
; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
@@ -25208,14 +29063,16 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
@@ -25234,7 +29091,8 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp_bf16:
@@ -25369,21 +29227,24 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
; GCN-NEXT: v_mov_b32_e32 v2, 0x1f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp2_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -25393,7 +29254,8 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp2_bf16:
@@ -25492,7 +29354,8 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GCN-NEXT: s_mov_b32 s4, 0xc23369f4
; GCN-NEXT: s_mov_b32 s5, 0x421a209b
; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
@@ -25510,14 +29373,16 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp10_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x40549a78
; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
@@ -25536,7 +29401,8 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp10_bf16:
@@ -25670,18 +29536,22 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_ceil_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_ceil_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_ceil_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ceil_bf16:
@@ -25751,18 +29621,22 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_trunc_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_trunc_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_trunc_bf16:
@@ -25832,18 +29706,22 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_rint_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rint_bf16:
@@ -25913,18 +29791,22 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_nearbyint_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_nearbyint_bf16:
@@ -25994,7 +29876,8 @@ define bfloat @v_round_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v0
; GCN-NEXT: v_sub_f32_e32 v2, v0, v1
; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -26002,14 +29885,16 @@ define bfloat @v_round_bf16(bfloat %a) {
; GCN-NEXT: s_brev_b32 s4, -2
; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0
; GCN-NEXT: v_add_f32_e32 v0, v1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_round_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v1, v0
; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -26017,7 +29902,8 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX7-NEXT: s_brev_b32 s4, -2
; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_round_bf16:
@@ -26111,18 +29997,22 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_bf16:
@@ -26192,18 +30082,22 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_floor_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_floor_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_floor_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_floor_bf16:
@@ -26273,16 +30167,22 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
@@ -26399,8 +30299,10 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26410,8 +30312,10 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26462,8 +30366,10 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26473,8 +30379,10 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26525,8 +30433,10 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26536,8 +30446,10 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26588,8 +30500,10 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26599,8 +30513,10 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26651,8 +30567,10 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26662,8 +30580,10 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26714,8 +30634,10 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26725,8 +30647,10 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26777,8 +30701,10 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26788,8 +30714,10 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26840,8 +30768,10 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26851,8 +30781,10 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26903,8 +30835,10 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26914,8 +30848,10 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -26966,8 +30902,10 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26977,8 +30915,10 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -27029,8 +30969,10 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -27040,8 +30982,10 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -27092,8 +31036,10 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -27103,8 +31049,10 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -27155,8 +31103,10 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -27166,8 +31116,10 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -28261,7 +32213,8 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -28269,7 +32222,8 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -28311,14 +32265,16 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16:
@@ -28326,31 +32282,35 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
@@ -28360,23 +32320,25 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
+; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i16>
ret <2 x i16> %op
@@ -28389,9 +32351,12 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2
@@ -28408,11 +32373,14 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -28424,20 +32392,22 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -28449,27 +32419,30 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i16>
ret <3 x i16> %op
@@ -28483,10 +32456,14 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
@@ -28498,7 +32475,7 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_bfe_u32 v3, v3, 0, 16
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16:
@@ -28506,12 +32483,16 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
@@ -28523,31 +32504,35 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_i32_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
@@ -28560,34 +32545,39 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i16>
ret <4 x i16> %op
@@ -28598,7 +32588,8 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -28606,7 +32597,8 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -28648,8 +32640,10 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -28659,8 +32653,10 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -28669,8 +32665,9 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -28679,8 +32676,9 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -28688,20 +32686,23 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v1
-; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1
-; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i32>
ret <2 x i32> %op
@@ -28714,9 +32715,12 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
@@ -28728,9 +32732,12 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
@@ -28740,50 +32747,52 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
-; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v0
+; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
-; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v0
+; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
-; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
-; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v2
+; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
-; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2
+; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i32>
ret <3 x i32> %op
@@ -28797,10 +32806,14 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
@@ -28814,10 +32827,14 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
@@ -28828,58 +32845,67 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_cvt_i32_f32_e32 v5, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
-; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_cvt_i32_f32_e32 v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
-; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
-; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
-; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v1
+; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v4
+; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
-; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
-; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v4
+; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i32>
ret <4 x i32> %op
@@ -28892,7 +32918,8 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
@@ -28910,7 +32937,8 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4
@@ -29014,8 +33042,10 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mul_f32_e64 v2, |v0|, s4
@@ -29044,7 +33074,8 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v2, |v0|, s4
@@ -29053,8 +33084,9 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4
; GFX7-NEXT: v_trunc_f32_e32 v3, v1
; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4
@@ -29079,8 +33111,9 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e64 v2, |v1|, s4
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_floor_f32_e32 v2, v2
; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
; GFX8-NEXT: v_trunc_f32_e32 v4, v0
@@ -29110,8 +33143,9 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_floor_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
; GFX9-NEXT: v_trunc_f32_e32 v4, v0
@@ -29138,28 +33172,29 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_trunc_f32_e32 v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
-; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
-; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-NEXT: v_trunc_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1|
; GFX10-NEXT: v_floor_f32_e32 v2, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX10-NEXT: v_floor_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
-; GFX10-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
-; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0|
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v4
-; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v5
-; GFX10-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
+; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX10-NEXT: v_xor_b32_e32 v3, v3, v6
-; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_xor_b32_e32 v4, v4, v6
-; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v4, v0, v6
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -29167,35 +33202,38 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_trunc_f32_e32 v1, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
-; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
-; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-NEXT: v_trunc_f32_e32 v1, v1
+; GFX11-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1|
; GFX11-NEXT: v_floor_f32_e32 v2, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_floor_f32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
-; GFX11-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0|
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v4
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v4
-; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, v3, v6
+; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_xor_b32_e32 v4, v4, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT: v_xor_b32_e32 v3, v3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_xor_b32_e32 v4, v0, v6
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -29212,9 +33250,12 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
@@ -29254,7 +33295,8 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
@@ -29263,8 +33305,9 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5
; GFX7-NEXT: v_trunc_f32_e32 v4, v1
; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4
@@ -29273,12 +33316,13 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4|
; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v5, vcc
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_xor_b32_e32 v5, v6, v3
; GFX7-NEXT: v_trunc_f32_e32 v6, v2
; GFX7-NEXT: v_mul_f32_e64 v2, |v6|, s4
@@ -29306,38 +33350,39 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX8-NEXT: v_floor_f32_e32 v3, v3
; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX8-NEXT: v_trunc_f32_e32 v5, v0
+; GFX8-NEXT: v_trunc_f32_e32 v5, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
-; GFX8-NEXT: v_floor_f32_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e64 v1, |v5|, s4
+; GFX8-NEXT: v_floor_f32_e32 v1, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
+; GFX8-NEXT: v_fma_f32 v6, v1, s5, |v5|
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
-; GFX8-NEXT: v_trunc_f32_e32 v1, v1
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v3, v2, vcc
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX8-NEXT: v_mul_f32_e64 v5, |v1|, s4
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v4, v2
+; GFX8-NEXT: v_trunc_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v1
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5
+; GFX8-NEXT: v_mul_f32_e64 v5, |v0|, s4
; GFX8-NEXT: v_floor_f32_e32 v5, v5
-; GFX8-NEXT: v_xor_b32_e32 v2, v7, v3
-; GFX8-NEXT: v_fma_f32 v7, v5, s5, |v1|
+; GFX8-NEXT: v_xor_b32_e32 v4, v7, v2
+; GFX8-NEXT: v_fma_f32 v7, v5, s5, |v0|
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX8-NEXT: v_xor_b32_e32 v4, v8, v3
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, v7, v1
-; GFX8-NEXT: v_xor_b32_e32 v5, v5, v1
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: v_xor_b32_e32 v3, v8, v2
+; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v5
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v2, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, v7, v0
+; GFX8-NEXT: v_xor_b32_e32 v3, v8, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
+; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
@@ -29350,125 +33395,129 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX9-NEXT: v_floor_f32_e32 v3, v3
; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT: v_trunc_f32_e32 v5, v0
+; GFX9-NEXT: v_trunc_f32_e32 v5, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
+; GFX9-NEXT: v_mul_f32_e64 v1, |v5|, s4
+; GFX9-NEXT: v_floor_f32_e32 v1, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
+; GFX9-NEXT: v_fma_f32 v6, v1, s5, |v5|
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT: v_trunc_f32_e32 v1, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v4, v2
+; GFX9-NEXT: v_trunc_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
+; GFX9-NEXT: v_mul_f32_e64 v5, |v0|, s4
; GFX9-NEXT: v_floor_f32_e32 v5, v5
-; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3
-; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1|
+; GFX9-NEXT: v_xor_b32_e32 v4, v7, v2
+; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v0|
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-NEXT: v_xor_b32_e32 v3, v8, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v2, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v7, v0
+; GFX9-NEXT: v_xor_b32_e32 v3, v8, v0
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
-; GFX10-NEXT: v_trunc_f32_e32 v0, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
-; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
-; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
+; GFX10-NEXT: v_trunc_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v1|
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX10-NEXT: v_floor_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e64 v7, 0x2f800000, |v0|
; GFX10-NEXT: v_floor_f32_e32 v4, v4
-; GFX10-NEXT: v_floor_f32_e32 v6, v6
-; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v0
; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
-; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
-; GFX10-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
+; GFX10-NEXT: v_floor_f32_e32 v7, v7
+; GFX10-NEXT: v_fma_f32 v1, 0xcf800000, v4, |v1|
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v7, |v0|
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_xor_b32_e32 v3, v3, v5
-; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
-; GFX10-NEXT: v_xor_b32_e32 v9, v0, v7
-; GFX10-NEXT: v_xor_b32_e32 v4, v4, v7
-; GFX10-NEXT: v_xor_b32_e32 v10, v1, v8
-; GFX10-NEXT: v_xor_b32_e32 v6, v6, v8
+; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v0
+; GFX10-NEXT: v_xor_b32_e32 v9, v4, v6
+; GFX10-NEXT: v_xor_b32_e32 v4, v1, v6
+; GFX10-NEXT: v_xor_b32_e32 v7, v7, v8
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
+; GFX10-NEXT: v_xor_b32_e32 v2, v10, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v6
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v9, v6, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v8
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v7, v8, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_trunc_f32_e32 v2, v2
-; GFX11-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_trunc_f32_e32 v1, v1
; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
-; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
+; GFX11-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v1|
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
+; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX11-NEXT: v_floor_f32_e32 v3, v3
+; GFX11-NEXT: v_mul_f32_e64 v7, 0x2f800000, |v0|
; GFX11-NEXT: v_floor_f32_e32 v4, v4
-; GFX11-NEXT: v_floor_f32_e32 v6, v6
-; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
-; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
+; GFX11-NEXT: v_floor_f32_e32 v7, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
+; GFX11-NEXT: v_fma_f32 v1, 0xcf800000, v4, |v1|
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v7, |v0|
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-NEXT: v_xor_b32_e32 v3, v3, v5
-; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX11-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
-; GFX11-NEXT: v_xor_b32_e32 v9, v0, v7
-; GFX11-NEXT: v_xor_b32_e32 v4, v4, v7
-; GFX11-NEXT: v_xor_b32_e32 v10, v1, v8
-; GFX11-NEXT: v_xor_b32_e32 v6, v6, v8
+; GFX11-NEXT: v_cvt_u32_f32_e32 v10, v0
+; GFX11-NEXT: v_xor_b32_e32 v9, v4, v6
+; GFX11-NEXT: v_xor_b32_e32 v4, v1, v6
+; GFX11-NEXT: v_xor_b32_e32 v7, v7, v8
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
+; GFX11-NEXT: v_xor_b32_e32 v2, v10, v8
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
-; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
-; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v9, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v8
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v7, v8, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i64>
ret <3 x i64> %op
@@ -29484,10 +33533,14 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
@@ -29538,18 +33591,21 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
; GFX7-NEXT: v_floor_f32_e32 v3, v3
; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6
; GFX7-NEXT: v_trunc_f32_e32 v5, v1
; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4
@@ -29558,12 +33614,13 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5|
; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3
; GFX7-NEXT: v_trunc_f32_e32 v7, v2
; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4
@@ -29575,7 +33632,7 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3
; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5
; GFX7-NEXT: v_trunc_f32_e32 v8, v4
; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4
@@ -29603,50 +33660,53 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX8-NEXT: v_floor_f32_e32 v3, v3
; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX8-NEXT: v_trunc_f32_e32 v5, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
-; GFX8-NEXT: v_floor_f32_e32 v0, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
-; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
-; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v3, v2, vcc
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_trunc_f32_e32 v5, v5
-; GFX8-NEXT: v_xor_b32_e32 v2, v6, v3
; GFX8-NEXT: v_mul_f32_e64 v6, |v5|, s4
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX8-NEXT: v_floor_f32_e32 v6, v6
-; GFX8-NEXT: v_xor_b32_e32 v4, v7, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX8-NEXT: v_fma_f32 v7, v6, s5, |v5|
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v2
+; GFX8-NEXT: v_trunc_f32_e32 v0, v0
+; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v3, v2, vcc
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5
+; GFX8-NEXT: v_mul_f32_e64 v5, |v0|, s4
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT: v_trunc_f32_e32 v1, v1
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, v7, v5
-; GFX8-NEXT: v_mul_f32_e64 v7, |v1|, s4
-; GFX8-NEXT: v_floor_f32_e32 v7, v7
+; GFX8-NEXT: v_floor_f32_e32 v5, v5
+; GFX8-NEXT: v_xor_b32_e32 v3, v6, v2
+; GFX8-NEXT: v_fma_f32 v6, v5, s5, |v0|
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT: v_fma_f32 v9, v7, s5, |v1|
-; GFX8-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_xor_b32_e32 v4, v7, v2
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GFX8-NEXT: v_trunc_f32_e32 v1, v1
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v2, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, v6, v0
+; GFX8-NEXT: v_mul_f32_e64 v6, |v1|, s4
+; GFX8-NEXT: v_floor_f32_e32 v6, v6
+; GFX8-NEXT: v_xor_b32_e32 v3, v7, v0
+; GFX8-NEXT: v_fma_f32 v7, v6, s5, |v1|
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX8-NEXT: v_xor_b32_e32 v6, v6, v5
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
-; GFX8-NEXT: v_xor_b32_e32 v6, v9, v1
-; GFX8-NEXT: v_xor_b32_e32 v7, v7, v1
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v1
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
+; GFX8-NEXT: v_xor_b32_e32 v1, v6, v0
+; GFX8-NEXT: v_xor_b32_e32 v6, v7, v0
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v1, v0, vcc
+; GFX8-NEXT: v_mov_b32_e32 v0, v8
+; GFX8-NEXT: v_mov_b32_e32 v1, v9
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
@@ -29659,161 +33719,168 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX9-NEXT: v_floor_f32_e32 v3, v3
; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT: v_trunc_f32_e32 v5, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
-; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
-; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3
; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX9-NEXT: v_floor_f32_e32 v6, v6
-; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5|
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
+; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v2
+; GFX9-NEXT: v_trunc_f32_e32 v0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v3, v2, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
+; GFX9-NEXT: v_mul_f32_e64 v5, |v0|, s4
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT: v_trunc_f32_e32 v1, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5
-; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4
-; GFX9-NEXT: v_floor_f32_e32 v7, v7
+; GFX9-NEXT: v_floor_f32_e32 v5, v5
+; GFX9-NEXT: v_xor_b32_e32 v3, v6, v2
+; GFX9-NEXT: v_fma_f32 v6, v5, s5, |v0|
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1|
-; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_xor_b32_e32 v4, v7, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v2, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v6, v0
+; GFX9-NEXT: v_mul_f32_e64 v6, |v1|, s4
+; GFX9-NEXT: v_floor_f32_e32 v6, v6
+; GFX9-NEXT: v_xor_b32_e32 v3, v7, v0
+; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v1|
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1
-; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v0, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v1
+; GFX9-NEXT: v_xor_b32_e32 v1, v6, v0
+; GFX9-NEXT: v_xor_b32_e32 v6, v7, v0
+; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-NEXT: v_mov_b32_e32 v1, v9
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
-; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
-; GFX10-NEXT: v_trunc_f32_e32 v4, v1
-; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
-; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
-; GFX10-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v3|
+; GFX10-NEXT: v_trunc_f32_e32 v0, v0
+; GFX10-NEXT: v_trunc_f32_e32 v8, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GFX10-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
-; GFX10-NEXT: v_floor_f32_e32 v1, v1
-; GFX10-NEXT: v_floor_f32_e32 v6, v6
-; GFX10-NEXT: v_floor_f32_e32 v8, v8
-; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
-; GFX10-NEXT: v_floor_f32_e32 v9, v9
+; GFX10-NEXT: v_floor_f32_e32 v1, v4
+; GFX10-NEXT: v_floor_f32_e32 v4, v6
+; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
+; GFX10-NEXT: v_mul_f32_e64 v10, 0x2f800000, |v8|
+; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
-; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
-; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
+; GFX10-NEXT: v_fma_f32 v3, 0xcf800000, v4, |v3|
+; GFX10-NEXT: v_floor_f32_e32 v6, v6
+; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX10-NEXT: v_floor_f32_e32 v10, v10
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX10-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
-; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
-; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v3
-; GFX10-NEXT: v_xor_b32_e32 v3, v0, v7
-; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX10-NEXT: v_xor_b32_e32 v6, v6, v7
-; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GFX10-NEXT: v_xor_b32_e32 v11, v4, v7
+; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v10, |v8|
+; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-NEXT: v_xor_b32_e32 v3, v3, v7
+; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
-; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v4
-; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v4
+; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v4, v12, v10
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
-; GFX10-NEXT: v_xor_b32_e32 v5, v8, v10
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v6, v11, v13
-; GFX10-NEXT: v_xor_b32_e32 v7, v9, v13
-; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, v7
+; GFX10-NEXT: v_xor_b32_e32 v3, v12, v9
+; GFX10-NEXT: v_xor_b32_e32 v6, v6, v9
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v11, v7, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v7, v10, v8
+; GFX10-NEXT: v_xor_b32_e32 v10, v2, v8
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v9
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v9, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v10, v8
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v8, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_trunc_f32_e32 v2, v2
-; GFX11-NEXT: v_trunc_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
-; GFX11-NEXT: v_trunc_f32_e32 v4, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
-; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v3|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11-NEXT: v_trunc_f32_e32 v8, v1
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GFX11-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
-; GFX11-NEXT: v_floor_f32_e32 v1, v1
-; GFX11-NEXT: v_floor_f32_e32 v6, v6
-; GFX11-NEXT: v_floor_f32_e32 v8, v8
-; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
-; GFX11-NEXT: v_floor_f32_e32 v9, v9
+; GFX11-NEXT: v_floor_f32_e32 v1, v4
+; GFX11-NEXT: v_floor_f32_e32 v4, v6
+; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
+; GFX11-NEXT: v_mul_f32_e64 v10, 0x2f800000, |v8|
+; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
-; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
-; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
+; GFX11-NEXT: v_fma_f32 v3, 0xcf800000, v4, |v3|
+; GFX11-NEXT: v_floor_f32_e32 v6, v6
+; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX11-NEXT: v_floor_f32_e32 v10, v10
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX11-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
-; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v0
+; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
-; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v3
-; GFX11-NEXT: v_xor_b32_e32 v3, v0, v7
-; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX11-NEXT: v_xor_b32_e32 v6, v6, v7
-; GFX11-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GFX11-NEXT: v_xor_b32_e32 v11, v4, v7
+; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v10, |v8|
+; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT: v_xor_b32_e32 v3, v3, v7
+; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v0
+; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
-; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v4
-; GFX11-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v4
+; GFX11-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10
-; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
-; GFX11-NEXT: v_xor_b32_e32 v5, v8, v10
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
-; GFX11-NEXT: v_xor_b32_e32 v6, v11, v13
-; GFX11-NEXT: v_xor_b32_e32 v7, v9, v13
-; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v3, v7
+; GFX11-NEXT: v_xor_b32_e32 v3, v12, v9
+; GFX11-NEXT: v_xor_b32_e32 v6, v6, v9
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v11, v7, vcc_lo
+; GFX11-NEXT: v_xor_b32_e32 v7, v10, v8
+; GFX11-NEXT: v_xor_b32_e32 v10, v2, v8
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v9
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v10, v8
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v8, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i64>
ret <4 x i64> %op
@@ -29825,7 +33892,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i16_to_bf16:
@@ -29833,7 +33901,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i16_to_bf16:
@@ -29901,19 +33970,25 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
@@ -30007,28 +34082,36 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -30119,35 +34202,45 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
@@ -30300,14 +34393,16 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i32_to_bf16:
@@ -30371,17 +34466,23 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
@@ -30471,23 +34572,31 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -30579,27 +34688,37 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
@@ -30757,7 +34876,8 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i64_to_bf16:
@@ -30775,7 +34895,8 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i64_to_bf16:
@@ -30910,8 +35031,11 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -30927,22 +35051,25 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v4
+; GFX7-NEXT: v_ffbh_i32_e32 v4, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
-; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT: v_min_u32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
-; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
-; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v4
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -31130,10 +35257,10 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_ffbh_i32_e32 v6, v5
-; GCN-NEXT: v_xor_b32_e32 v7, v4, v5
-; GCN-NEXT: v_ffbh_i32_e32 v8, v3
-; GCN-NEXT: v_xor_b32_e32 v9, v2, v3
+; GCN-NEXT: v_ffbh_i32_e32 v6, v3
+; GCN-NEXT: v_xor_b32_e32 v7, v2, v3
+; GCN-NEXT: v_ffbh_i32_e32 v8, v5
+; GCN-NEXT: v_xor_b32_e32 v9, v4, v5
; GCN-NEXT: v_ffbh_i32_e32 v10, v1
; GCN-NEXT: v_xor_b32_e32 v11, v0, v1
; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
@@ -31148,71 +35275,79 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GCN-NEXT: v_min_u32_e32 v6, v6, v7
; GCN-NEXT: v_min_u32_e32 v7, v8, v9
; GCN-NEXT: v_min_u32_e32 v8, v10, v11
-; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
-; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v7
; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
-; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
+; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
-; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_cvt_f32_i32_e32 v1, v4
-; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2
+; GCN-NEXT: v_cvt_f32_i32_e32 v2, v3
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
-; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
+; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v6
+; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v7
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_xor_b32_e32 v7, v4, v5
-; GFX7-NEXT: v_ffbh_i32_e32 v6, v5
-; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
-; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
-; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
-; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
; GFX7-NEXT: v_xor_b32_e32 v7, v2, v3
-; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
; GFX7-NEXT: v_ffbh_i32_e32 v6, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
-; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
-; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v6
+; GFX7-NEXT: v_ldexp_f32_e32 v6, v2, v3
+; GFX7-NEXT: v_xor_b32_e32 v3, v4, v5
+; GFX7-NEXT: v_ffbh_i32_e32 v2, v5
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, -1, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v3
+; GFX7-NEXT: v_min_u32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[4:5], v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_xor_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_ffbh_i32_e32 v5, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, -1, v5
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 32, v6
+; GFX7-NEXT: v_min_u32_e32 v5, v5, v6
+; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
-; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v7
+; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v5
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -31403,12 +35538,12 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_ffbh_i32_e32 v8, v7
-; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
-; GCN-NEXT: v_ffbh_i32_e32 v10, v5
-; GCN-NEXT: v_xor_b32_e32 v11, v4, v5
-; GCN-NEXT: v_ffbh_i32_e32 v12, v3
-; GCN-NEXT: v_xor_b32_e32 v13, v2, v3
+; GCN-NEXT: v_ffbh_i32_e32 v8, v3
+; GCN-NEXT: v_xor_b32_e32 v9, v2, v3
+; GCN-NEXT: v_ffbh_i32_e32 v10, v7
+; GCN-NEXT: v_xor_b32_e32 v11, v6, v7
+; GCN-NEXT: v_ffbh_i32_e32 v12, v5
+; GCN-NEXT: v_xor_b32_e32 v13, v4, v5
; GCN-NEXT: v_ffbh_i32_e32 v14, v1
; GCN-NEXT: v_xor_b32_e32 v15, v0, v1
; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
@@ -31427,91 +35562,101 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GCN-NEXT: v_min_u32_e32 v9, v10, v11
; GCN-NEXT: v_min_u32_e32 v10, v12, v13
; GCN-NEXT: v_min_u32_e32 v11, v14, v15
-; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
-; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
+; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v9
; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
-; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v10
; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
+; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v6, 1, v6
; GCN-NEXT: v_min_u32_e32 v4, 1, v4
-; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
-; GCN-NEXT: v_or_b32_e32 v6, v7, v6
-; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_cvt_f32_i32_e32 v1, v6
+; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2
+; GCN-NEXT: v_cvt_f32_i32_e32 v2, v3
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v4
-; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
-; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
-; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
+; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v8
+; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v9
+; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v10
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_xor_b32_e32 v9, v6, v7
-; GFX7-NEXT: v_ffbh_i32_e32 v8, v7
-; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
-; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
-; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
-; GFX7-NEXT: v_xor_b32_e32 v9, v4, v5
-; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
-; GFX7-NEXT: v_ffbh_i32_e32 v8, v5
+; GFX7-NEXT: v_xor_b32_e32 v9, v2, v3
+; GFX7-NEXT: v_ffbh_i32_e32 v8, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
-; GFX7-NEXT: v_cvt_f32_i32_e32 v6, v6
-; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
-; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
-; GFX7-NEXT: v_xor_b32_e32 v8, v2, v3
-; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
-; GFX7-NEXT: v_ffbh_i32_e32 v7, v3
-; GFX7-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX7-NEXT: v_add_i32_e32 v7, vcc, -1, v7
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v8
-; GFX7-NEXT: v_min_u32_e32 v7, v7, v8
-; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
-; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
-; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v8
+; GFX7-NEXT: v_ldexp_f32_e32 v8, v2, v3
+; GFX7-NEXT: v_xor_b32_e32 v3, v6, v7
+; GFX7-NEXT: v_ffbh_i32_e32 v2, v7
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, -1, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v3
+; GFX7-NEXT: v_min_u32_e32 v9, v2, v3
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[6:7], v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_cvt_f32_i32_e32 v7, v2
+; GFX7-NEXT: v_xor_b32_e32 v3, v4, v5
+; GFX7-NEXT: v_ffbh_i32_e32 v2, v5
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, -1, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v3
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 32, v9
+; GFX7-NEXT: v_min_u32_e32 v9, v2, v3
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[4:5], v9
+; GFX7-NEXT: v_ldexp_f32_e32 v4, v7, v8
+; GFX7-NEXT: v_xor_b32_e32 v7, v0, v1
+; GFX7-NEXT: v_ffbh_i32_e32 v5, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, -1, v5
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
+; GFX7-NEXT: v_min_u32_e32 v5, v5, v7
+; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
-; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v9
+; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v5
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v6, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -31839,7 +35984,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i16_to_bf16:
@@ -31847,7 +35993,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i16_to_bf16:
@@ -31915,19 +36062,25 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -32021,28 +36174,36 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -32135,34 +36296,44 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -32317,14 +36488,16 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i32_to_bf16:
@@ -32388,17 +36561,23 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -32488,23 +36667,31 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -32596,27 +36783,37 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
+; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -32770,7 +36967,8 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i64_to_bf16:
@@ -32784,7 +36982,8 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i64_to_bf16:
@@ -32893,8 +37092,11 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -32903,21 +37105,24 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX7-NEXT: v_ffbh_u32_e32 v4, v3
; GFX7-NEXT: v_min_u32_e32 v4, 32, v4
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
-; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
-; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v4
+; GFX7-NEXT: v_ffbh_u32_e32 v4, v1
+; GFX7-NEXT: v_min_u32_e32 v4, 32, v4
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
-; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v4
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -33069,65 +37274,73 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_ffbh_u32_e32 v6, v5
-; GCN-NEXT: v_ffbh_u32_e32 v7, v3
+; GCN-NEXT: v_ffbh_u32_e32 v6, v3
+; GCN-NEXT: v_ffbh_u32_e32 v7, v5
; GCN-NEXT: v_ffbh_u32_e32 v8, v1
; GCN-NEXT: v_min_u32_e32 v6, 32, v6
; GCN-NEXT: v_min_u32_e32 v7, 32, v7
; GCN-NEXT: v_min_u32_e32 v8, 32, v8
-; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
-; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v7
; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
-; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
+; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
-; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
-; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
+; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v6
+; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v7
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_ffbh_u32_e32 v6, v5
-; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
-; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
-; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
; GFX7-NEXT: v_ffbh_u32_e32 v6, v3
; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
-; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
-; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v6
+; GFX7-NEXT: v_ldexp_f32_e32 v6, v2, v3
+; GFX7-NEXT: v_ffbh_u32_e32 v2, v5
+; GFX7-NEXT: v_min_u32_e32 v7, 32, v2
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[4:5], v7
+; GFX7-NEXT: v_ffbh_u32_e32 v5, v1
+; GFX7-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
-; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v7
+; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -33282,83 +37495,93 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_ffbh_u32_e32 v8, v7
-; GCN-NEXT: v_ffbh_u32_e32 v9, v5
-; GCN-NEXT: v_ffbh_u32_e32 v10, v3
+; GCN-NEXT: v_ffbh_u32_e32 v8, v3
+; GCN-NEXT: v_ffbh_u32_e32 v9, v7
+; GCN-NEXT: v_ffbh_u32_e32 v10, v5
; GCN-NEXT: v_ffbh_u32_e32 v11, v1
; GCN-NEXT: v_min_u32_e32 v8, 32, v8
; GCN-NEXT: v_min_u32_e32 v9, 32, v9
; GCN-NEXT: v_min_u32_e32 v10, 32, v10
; GCN-NEXT: v_min_u32_e32 v11, 32, v11
-; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
-; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
+; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v9
; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
-; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v10
; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
+; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v6, 1, v6
; GCN-NEXT: v_min_u32_e32 v4, 1, v4
-; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
-; GCN-NEXT: v_or_b32_e32 v6, v7, v6
-; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
-; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
-; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
+; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v8
+; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v9
+; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v10
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_ffbh_u32_e32 v8, v7
-; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
-; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
-; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX7-NEXT: v_cvt_f32_u32_e32 v6, v6
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
-; GFX7-NEXT: v_ffbh_u32_e32 v8, v5
-; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
-; GFX7-NEXT: v_ffbh_u32_e32 v7, v3
-; GFX7-NEXT: v_min_u32_e32 v7, 32, v7
-; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
+; GFX7-NEXT: v_ffbh_u32_e32 v8, v3
; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
-; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
-; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v8
+; GFX7-NEXT: v_ldexp_f32_e32 v8, v2, v3
+; GFX7-NEXT: v_ffbh_u32_e32 v2, v7
+; GFX7-NEXT: v_min_u32_e32 v9, 32, v2
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[6:7], v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_cvt_f32_u32_e32 v7, v2
+; GFX7-NEXT: v_ffbh_u32_e32 v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 32, v9
+; GFX7-NEXT: v_min_u32_e32 v9, 32, v2
+; GFX7-NEXT: v_lshl_b64 v[2:3], v[4:5], v9
+; GFX7-NEXT: v_ffbh_u32_e32 v5, v1
+; GFX7-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
-; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
-; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v9
+; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_ldexp_f32_e32 v4, v7, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v5
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v6, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
@@ -33629,7 +37852,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_bf16:
@@ -33640,7 +37864,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_bf16:
@@ -33688,7 +37913,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_lhs_bf16:
@@ -33699,7 +37925,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_lhs_bf16:
@@ -33752,7 +37979,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_rhs_bf16:
@@ -33763,7 +37991,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_rhs_bf16:
@@ -33824,26 +38053,28 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v2, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX7-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v2bf16:
@@ -33914,8 +38145,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v2bf16:
@@ -33930,9 +38164,12 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v2bf16:
@@ -34171,33 +38408,31 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
; GCN-LABEL: s_vselect_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2
-; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1
-; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
+; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3
+; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0
+; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_vselect_v2bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1
-; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
+; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
+; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -34293,7 +38528,8 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v3bf16:
@@ -34317,7 +38553,8 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v3bf16:
@@ -34384,9 +38621,11 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v4bf16:
@@ -34413,9 +38652,11 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v4bf16:
@@ -34491,11 +38732,14 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v6bf16:
@@ -34531,11 +38775,14 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v6bf16:
@@ -34624,13 +38871,17 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v8bf16:
@@ -34675,13 +38926,17 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v8bf16:
@@ -34810,24 +39065,32 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16
; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v16bf16:
@@ -34905,17 +39168,23 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
@@ -34924,9 +39193,11 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16
; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v16bf16:
@@ -35193,52 +39464,68 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc
; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v13, v28, v13, vcc
; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v11, v26, v11, vcc
; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v24, v9, vcc
; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
+; GCN-NEXT: v_cndmask_b32_e32 v7, v22, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v14, v21, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v20, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v10, v19, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v18, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v12, v17, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v25
-; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v27
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v27
-; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v28
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v32bf16:
@@ -35339,7 +39626,8 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(12)
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: s_waitcnt vmcnt(11)
@@ -35396,82 +39684,96 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
@@ -35483,7 +39785,8 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16
; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v32bf16:
@@ -35703,14 +40006,14 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
; GCN-LABEL: s_select_v4bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
-; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5
-; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4
-; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
-; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2
-; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7
-; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6
+; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s3
+; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s7
+; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s6
+; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s1
+; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s0
+; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5
+; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s4
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
@@ -35720,35 +40023,39 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GCN-NEXT: v_readfirstlane_b32 s0, v1
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v3, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_v4bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
+; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s3
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
+; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
+; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s7
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4
+; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s6
; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
+; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6
+; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7-NEXT: v_readfirstlane_b32 s1, v0
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_select_v4bf16:
@@ -35814,56 +40121,52 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
; GCN-LABEL: s_vselect_v4bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0
-; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4
-; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1
-; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5
-; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2
-; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6
-; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3
-; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
+; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1
+; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s5
+; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s0
+; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s4
+; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s3
+; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s7
+; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s2
+; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s6
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v11, v10, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v2, v2, v3
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: v_readfirstlane_b32 s1, v2
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_vselect_v4bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3
-; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2
-; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
+; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s2
+; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s6
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1
-; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
-; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s3
+; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s7
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v11, v10, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s0
+; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s4
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1
+; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s5
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v2
; GFX7-NEXT: ; return to shader part epilog
@@ -35997,56 +40300,68 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v4bf16:
@@ -36155,104 +40470,128 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v5, 1, v5
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v7, v7, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_bfe_u32 v7, v7, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v8bf16:
@@ -36442,103 +40781,103 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_writelane_b32 v31, s35, 3
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v2
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v4
; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v3
+; GCN-NEXT: v_and_b32_e32 v0, 1, v6
; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v4
+; GCN-NEXT: v_and_b32_e32 v0, 1, v8
; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v5
+; GCN-NEXT: v_and_b32_e32 v0, 1, v10
; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v6
+; GCN-NEXT: v_and_b32_e32 v0, 1, v12
; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v7
+; GCN-NEXT: v_and_b32_e32 v0, 1, v14
; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v8
+; GCN-NEXT: v_and_b32_e32 v0, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v9
+; GCN-NEXT: v_and_b32_e32 v0, 1, v3
; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v1, 1, v10
+; GCN-NEXT: v_and_b32_e32 v1, 1, v5
; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v3, 1, v11
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v3, 1, v7
; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v5, 1, v12
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20
+; GCN-NEXT: v_and_b32_e32 v5, 1, v9
; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v7, 1, v13
-; GCN-NEXT: v_and_b32_e32 v8, 1, v14
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v7, 1, v11
+; GCN-NEXT: v_and_b32_e32 v8, 1, v13
; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
; GCN-NEXT: v_and_b32_e32 v9, 1, v15
; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35]
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
+; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[34:35]
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31]
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29]
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27]
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[30:31]
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25]
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[28:29]
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23]
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[26:27]
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21]
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[24:25]
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19]
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[22:23]
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17]
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[20:21]
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[18:19]
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[16:17]
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15]
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
@@ -36551,22 +40890,46 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_bfe_u32 v1, v22, 0, 16
+; GCN-NEXT: v_bfe_u32 v3, v13, 0, 16
+; GCN-NEXT: v_bfe_u32 v5, v21, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v20, 0, 16
+; GCN-NEXT: v_bfe_u32 v11, v11, 0, 16
+; GCN-NEXT: v_bfe_u32 v15, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v16, v18, 0, 16
+; GCN-NEXT: v_bfe_u32 v17, v7, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v17
; GCN-NEXT: v_readlane_b32 s35, v31, 3
; GCN-NEXT: v_readlane_b32 s34, v31, 2
; GCN-NEXT: v_readlane_b32 s31, v31, 1
@@ -36585,142 +40948,166 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v12
; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v14
; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v12
; GFX7-NEXT: v_writelane_b32 v31, s30, 0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v13
; GFX7-NEXT: v_writelane_b32 v31, s31, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v14
; GFX7-NEXT: v_writelane_b32 v31, s34, 2
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v9
; GFX7-NEXT: v_and_b32_e32 v5, 1, v15
; GFX7-NEXT: v_writelane_b32 v31, s35, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v11
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v13
; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
-; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[34:35]
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX7-NEXT: v_readlane_b32 s35, v31, 3
+; GFX7-NEXT: v_readlane_b32 s34, v31, 2
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31]
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[30:31]
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX7-NEXT: v_readlane_b32 s31, v31, 1
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29]
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[28:29]
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27]
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27
-; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25]
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_readlane_b32 s35, v31, 3
-; GFX7-NEXT: v_readlane_b32 s34, v31, 2
-; GFX7-NEXT: v_readlane_b32 s31, v31, 1
+; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[26:27]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[24:25]
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v4
; GFX7-NEXT: v_readlane_b32 s30, v31, 0
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25
-; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21]
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[22:23]
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19]
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[20:21]
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17]
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[18:19]
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15]
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[16:17]
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:20
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[14:15]
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[12:13]
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5]
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7]
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9]
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v15, v20, s[10:11]
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v14, v19, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v13, v18, s[6:7]
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, v17, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v16, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v7
+; GFX7-NEXT: v_bfe_u32 v7, v25, 0, 16
+; GFX7-NEXT: v_bfe_u32 v9, v24, 0, 16
+; GFX7-NEXT: v_bfe_u32 v11, v23, 0, 16
+; GFX7-NEXT: v_bfe_u32 v13, v22, 0, 16
+; GFX7-NEXT: v_bfe_u32 v15, v21, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -37075,657 +41462,757 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GCN-LABEL: v_vselect_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_and_b32_e32 v36, 1, v13
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192
-; GCN-NEXT: v_and_b32_e32 v53, 1, v26
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100
-; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
-; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112
-; GCN-NEXT: v_and_b32_e32 v27, 1, v27
-; GCN-NEXT: v_and_b32_e32 v28, 1, v28
-; GCN-NEXT: v_and_b32_e32 v29, 1, v29
-; GCN-NEXT: v_and_b32_e32 v30, 1, v30
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_and_b32_e32 v36, 1, v26
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:236
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:252
+; GCN-NEXT: v_and_b32_e32 v53, 1, v21
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_and_b32_e32 v55, 1, v23
+; GCN-NEXT: v_and_b32_e32 v41, 1, v25
+; GCN-NEXT: v_and_b32_e32 v25, 1, v27
+; GCN-NEXT: v_and_b32_e32 v23, 1, v29
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244
-; GCN-NEXT: s_waitcnt expcnt(6)
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:248
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:240
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:232
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:224
; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37
-; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v38
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30
-; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5]
-; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v46
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
+; GCN-NEXT: v_cndmask_b32_e64 v23, v37, v36, s[4:5]
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:216
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208
; GCN-NEXT: s_waitcnt expcnt(5)
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:200
; GCN-NEXT: s_waitcnt expcnt(4)
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:192
; GCN-NEXT: s_waitcnt expcnt(3)
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:184
; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:176
; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128
-; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:168
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(10)
-; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45
-; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(9)
-; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46
-; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v56
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29
-; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
-; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27
-; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
+; GCN-NEXT: v_cndmask_b32_e64 v25, v37, v29, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v41
+; GCN-NEXT: v_cndmask_b32_e64 v27, v38, v27, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v55
+; GCN-NEXT: v_cndmask_b32_e64 v29, v45, v40, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53
; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5]
-; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
-; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144
-; GCN-NEXT: v_and_b32_e32 v3, 1, v3
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_and_b32_e32 v5, 1, v5
-; GCN-NEXT: v_and_b32_e32 v6, 1, v6
-; GCN-NEXT: v_and_b32_e32 v18, 1, v18
-; GCN-NEXT: v_and_b32_e32 v22, 1, v22
-; GCN-NEXT: v_and_b32_e32 v23, 1, v23
-; GCN-NEXT: v_and_b32_e32 v24, 1, v24
-; GCN-NEXT: v_and_b32_e32 v25, 1, v25
-; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:148
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156
+; GCN-NEXT: v_and_b32_e32 v37, 1, v6
+; GCN-NEXT: v_and_b32_e32 v38, 1, v8
+; GCN-NEXT: v_and_b32_e32 v53, 1, v10
+; GCN-NEXT: v_and_b32_e32 v12, 1, v12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v63, 1, v5
+; GCN-NEXT: v_and_b32_e32 v10, 1, v13
+; GCN-NEXT: v_and_b32_e32 v8, 1, v15
+; GCN-NEXT: v_and_b32_e32 v6, 1, v17
+; GCN-NEXT: v_and_b32_e32 v5, 1, v19
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v52
; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56
-; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v51
; GCN-NEXT: s_waitcnt vmcnt(13)
-; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v58
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: s_waitcnt vmcnt(12)
-; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v59
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
; GCN-NEXT: s_waitcnt vmcnt(11)
-; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
-; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
-; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
-; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
-; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5]
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208
-; GCN-NEXT: v_and_b32_e32 v19, 1, v19
-; GCN-NEXT: v_and_b32_e32 v20, 1, v20
-; GCN-NEXT: v_and_b32_e32 v21, 1, v21
-; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v60
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v5
+; GCN-NEXT: v_cndmask_b32_e64 v5, v15, v13, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
+; GCN-NEXT: v_cndmask_b32_e64 v6, v19, v17, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
+; GCN-NEXT: v_cndmask_b32_e64 v8, v51, v50, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10
+; GCN-NEXT: v_cndmask_b32_e64 v10, v52, v49, s[4:5]
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:160
+; GCN-NEXT: v_and_b32_e32 v57, 1, v7
+; GCN-NEXT: v_and_b32_e32 v9, 1, v9
+; GCN-NEXT: v_and_b32_e32 v7, 1, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v48
; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v61
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v62
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21
-; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
-; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19
-; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
-; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5]
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
-; GCN-NEXT: v_and_b32_e32 v7, 1, v7
-; GCN-NEXT: v_and_b32_e32 v8, 1, v8
-; GCN-NEXT: v_and_b32_e32 v9, 1, v9
-; GCN-NEXT: v_and_b32_e32 v10, 1, v10
-; GCN-NEXT: v_and_b32_e32 v14, 1, v14
-; GCN-NEXT: v_and_b32_e32 v15, 1, v15
-; GCN-NEXT: v_and_b32_e32 v16, 1, v16
-; GCN-NEXT: v_and_b32_e32 v17, 1, v17
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
-; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
-; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
-; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15
-; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
+; GCN-NEXT: v_cndmask_b32_e64 v7, v48, v11, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
+; GCN-NEXT: v_cndmask_b32_e64 v9, v58, v39, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v57
+; GCN-NEXT: v_cndmask_b32_e64 v11, v52, v51, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v63
+; GCN-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[4:5]
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:172
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:180
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188
+; GCN-NEXT: v_and_b32_e32 v17, 1, v14
+; GCN-NEXT: v_and_b32_e32 v61, 1, v16
+; GCN-NEXT: v_and_b32_e32 v18, 1, v18
+; GCN-NEXT: v_and_b32_e32 v20, 1, v20
+; GCN-NEXT: v_and_b32_e32 v16, 1, v28
+; GCN-NEXT: v_and_b32_e32 v14, 1, v30
+; GCN-NEXT: v_and_b32_e32 v28, 1, v1
+; GCN-NEXT: v_and_b32_e32 v1, 1, v3
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v50
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v35, v19, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
+; GCN-NEXT: v_cndmask_b32_e64 v3, v15, v3, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
-; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5]
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44
-; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
-; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176
-; GCN-NEXT: v_and_b32_e32 v11, 1, v11
-; GCN-NEXT: v_and_b32_e32 v12, 1, v12
-; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256
-; GCN-NEXT: v_and_b32_e32 v26, 1, v26
-; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GCN-NEXT: v_cndmask_b32_e64 v14, v34, v33, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; GCN-NEXT: v_cndmask_b32_e64 v15, v32, v30, s[4:5]
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220
+; GCN-NEXT: v_and_b32_e32 v22, 1, v22
+; GCN-NEXT: v_and_b32_e32 v24, 1, v24
+; GCN-NEXT: v_cndmask_b32_e32 v16, v44, v43, vcc
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:256
+; GCN-NEXT: v_and_b32_e32 v21, 1, v21
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
-; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
-; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
-; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
; GCN-NEXT: s_waitcnt vmcnt(13)
-; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
; GCN-NEXT: s_waitcnt vmcnt(12)
-; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57
; GCN-NEXT: s_waitcnt vmcnt(11)
-; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58
; GCN-NEXT: s_waitcnt vmcnt(10)
-; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59
; GCN-NEXT: s_waitcnt vmcnt(9)
-; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
+; GCN-NEXT: v_cndmask_b32_e32 v24, v31, v26, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
+; GCN-NEXT: v_cndmask_b32_e32 v22, v49, v35, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; GCN-NEXT: v_cndmask_b32_e32 v20, v34, v33, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; GCN-NEXT: v_cndmask_b32_e32 v18, v32, v30, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v61
+; GCN-NEXT: v_cndmask_b32_e32 v19, v28, v19, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GCN-NEXT: v_cndmask_b32_e32 v17, v58, v57, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
-; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
-; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
-; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v12, v56, v47, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v53
+; GCN-NEXT: v_cndmask_b32_e32 v26, v52, v51, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v38
+; GCN-NEXT: v_cndmask_b32_e32 v28, v48, v39, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v37
+; GCN-NEXT: v_cndmask_b32_e32 v30, v46, v45, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v44, v43, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v41, v40, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
-; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: v_cndmask_b32_e32 v0, v55, v54, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; GCN-NEXT: v_cndmask_b32_e32 v21, v50, v42, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35
+; GCN-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GCN-NEXT: v_bfe_u32 v15, v1, 0, 16
+; GCN-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GCN-NEXT: v_bfe_u32 v11, v11, 0, 16
+; GCN-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GCN-NEXT: v_bfe_u32 v19, v7, 0, 16
+; GCN-NEXT: v_bfe_u32 v21, v34, 0, 16
+; GCN-NEXT: v_bfe_u32 v33, v33, 0, 16
+; GCN-NEXT: v_bfe_u32 v32, v32, 0, 16
+; GCN-NEXT: v_bfe_u32 v34, v5, 0, 16
+; GCN-NEXT: v_bfe_u32 v31, v31, 0, 16
+; GCN-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GCN-NEXT: v_bfe_u32 v27, v27, 0, 16
+; GCN-NEXT: v_bfe_u32 v35, v25, 0, 16
+; GCN-NEXT: v_bfe_u32 v36, v23, 0, 16
+; GCN-NEXT: v_bfe_u32 v37, v17, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228
+; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200
+; GFX7-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v19
+; GFX7-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v29
+; GFX7-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v21
+; GFX7-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v27
+; GFX7-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v23
; GFX7-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v25
+; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30
-; GFX7-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29
; GFX7-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28
-; GFX7-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27
; GFX7-NEXT: v_and_b32_e32 v26, 1, v26
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26
-; GFX7-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
; GFX7-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX7-NEXT: v_and_b32_e32 v21, 1, v21
; GFX7-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX7-NEXT: v_and_b32_e32 v19, 1, v19
; GFX7-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
; GFX7-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:240
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:216
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124
-; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120
+; GFX7-NEXT: s_waitcnt vmcnt(8)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[12:13]
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[14:15]
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_cndmask_b32_e64 v21, v23, v21, s[16:17]
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_cndmask_b32_e64 v23, v25, v23, s[10:11]
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_bfe_u32 v23, v23, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_cndmask_b32_e64 v25, v27, v25, s[8:9]
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v25
+; GFX7-NEXT: v_bfe_u32 v21, v21, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_cndmask_b32_e64 v27, v29, v27, s[6:7]
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v27
+; GFX7-NEXT: v_bfe_u32 v19, v19, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_cndmask_b32_e64 v29, v31, v29, s[4:5]
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_bfe_u32 v15, v15, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_bfe_u32 v11, v11, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5]
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_u32 v7, v7, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v30, v32, v30, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v28
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v25
+; GFX7-NEXT: v_bfe_u32 v25, v34, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v28, v32, v28, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX7-NEXT: v_bfe_u32 v27, v33, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v26, v32, v26, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
-; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v17, v17, 0, 16
+; GFX7-NEXT: v_bfe_u32 v29, v29, 0, 16
+; GFX7-NEXT: v_bfe_u32 v31, v32, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v32bf16:
@@ -38705,11 +43192,15 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_bf16:
@@ -38718,11 +43209,15 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_bf16:
@@ -38803,16 +43298,25 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v2bf16:
@@ -38824,16 +43328,25 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2bf16:
@@ -38844,10 +43357,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -38869,10 +43385,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
@@ -38893,13 +43412,16 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_fmac_f32_e32 v3, v0, v1
+; GFX10-NEXT: v_fmac_f32_e32 v2, v5, v4
; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
@@ -38916,27 +43438,33 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX11-LABEL: v_fma_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_fmac_f32_e32 v2, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_fmac_f32_e32 v3, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
@@ -38950,56 +43478,82 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
+; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_fma_f32 v2, v2, v5, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v3bf16:
@@ -39020,11 +43574,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -39057,10 +43614,13 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -39081,17 +43641,20 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_fmac_f32_e32 v6, v0, v2
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v8, v7
; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -39121,72 +43684,106 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
-; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_fma_f32 v3, v3, v7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_fma_f32 v2, v2, v6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v4bf16:
@@ -39197,10 +43794,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
@@ -39218,10 +43818,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
@@ -39245,10 +43848,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
@@ -39264,10 +43870,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -39290,21 +43899,27 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT: v_fmac_f32_e32 v7, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
-; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v9, v8
; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
@@ -39330,48 +43945,57 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX11-LABEL: v_fma_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX11-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_fmac_f32_e32 v7, v0, v2
; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
+; GFX11-NEXT: v_fmac_f32_e32 v4, v9, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
@@ -39390,13 +44014,18 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_bf16:
@@ -39404,14 +44033,19 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_bf16:
@@ -39426,7 +44060,8 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -39450,7 +44085,8 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -39473,7 +44109,8 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -39497,15 +44134,17 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -39522,45 +44161,67 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v1, v1, v5
; GCN-NEXT: v_add_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_v2bf16:
@@ -39575,14 +44236,17 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -39594,8 +44258,10 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
@@ -39619,11 +44285,14 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
@@ -39635,8 +44304,10 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
@@ -39650,84 +44321,97 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX10-LABEL: v_fmuladd_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v4, v3
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmuladd_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
@@ -39740,68 +44424,102 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_add_f32_e32 v2, v2, v8
-; GCN-NEXT: v_add_f32_e32 v1, v1, v7
; GCN-NEXT: v_add_f32_e32 v0, v0, v6
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_add_f32_e32 v1, v1, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_v3bf16:
@@ -39816,7 +44534,8 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -39835,13 +44554,16 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
@@ -39853,8 +44575,10 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
@@ -39879,7 +44603,8 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -39895,11 +44620,14 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -39911,8 +44639,10 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
@@ -39930,51 +44660,57 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT: v_add3_u32 v7, v9, v2, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -39989,88 +44725,130 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_add_f32_e32 v1, v1, v9
; GCN-NEXT: v_add_f32_e32 v3, v3, v11
; GCN-NEXT: v_add_f32_e32 v2, v2, v10
-; GCN-NEXT: v_add_f32_e32 v1, v1, v9
; GCN-NEXT: v_add_f32_e32 v0, v0, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_v4bf16:
@@ -40085,14 +44863,17 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
@@ -40104,8 +44885,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
@@ -40122,13 +44905,16 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
@@ -40140,8 +44926,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
@@ -40167,11 +44955,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
@@ -40183,8 +44974,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
@@ -40199,11 +44992,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
@@ -40215,8 +45011,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
@@ -40233,147 +45031,171 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v8, v9, v1, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
+; GFX10-NEXT: v_add3_u32 v3, v9, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v10, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmuladd_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_dual_mul_f32 v7, v8, v7 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v8, v9, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
+; GFX11-NEXT: v_add3_u32 v3, v9, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-NEXT: v_add3_u32 v9, v10, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v10 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v6
; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
index 0f40576a7459cc..807084f63a08e3 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -286,8 +286,9 @@ define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %o
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000
+; GCN-NEXT: s_lshr_b32 s3, s3, 16
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NEXT: s_lshl_b32 s3, s3, 16
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
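The hunk above trades the `s_and_b32 ... 0xffff0000` mask for an `s_lshr_b32`/`s_lshl_b32` pair, which is the standard keep-the-high-half identity. A minimal IR sketch of that equivalence follows; the function name is illustrative, not from the patch, and nothing beyond ordinary LLVM IR semantics is assumed:

define i32 @keep_high_half(i32 %x) {
  ; shifting right then left by 16 clears the low 16 bits, so
  ; %hi16 is identical to (and i32 %x, 0xffff0000) for all %x
  %shr = lshr i32 %x, 16
  %hi16 = shl i32 %shr, 16
  ret i32 %hi16
}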
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index e4c7df385d8619..35b8ce73f98c97 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -618,6 +618,7 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_bfe_u32 v1, v1, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v2i16:
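The extra `v_bfe_u32 v1, v1, 0, 16` added to the SI block above appears to be a low-half zero-extend (offset 0, width 16). In IR terms that is the shl-then-lshr roundtrip, which an `and` with 0xffff expresses equally well; a minimal sketch under that reading (illustrative name, not from the patch):

define i32 @zext_low_half(i32 %x) {
  ; shifting left then right by 16 zeroes the high bits,
  ; i.e. the result equals (and i32 %x, 65535)
  %shl = shl i32 %x, 16
  %lo16 = lshr i32 %shl, 16
  ret i32 %lo16
}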
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 46b2f82d9de2a3..c5be962e231706 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -15,6 +15,7 @@ define void @undef_lo_v2i16(i16 %arg0) {
; GFX8-LABEL: undef_lo_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v0
@@ -38,6 +39,7 @@ define void @undef_lo_v2f16(half %arg0) {
; GFX8-LABEL: undef_lo_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v0
@@ -115,6 +117,7 @@ define void @undef_lo3_v4i16(i16 %arg0) {
; GFX8-LABEL: undef_lo3_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v[0:1]
@@ -138,6 +141,7 @@ define void @undef_lo3_v4f16(half %arg0) {
; GFX8-LABEL: undef_lo3_v4f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v[0:1]
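Each GFX8 block above gains a `v_and_b32_e32 v0, 0xffff, v0` ahead of the 16-bit left shift. Since a left shift by 16 on an i32 already discards the top half, the mask is redundant in the final value; a minimal IR sketch of why (illustrative name, not from the patch):

define i32 @mask_before_shl_is_dead(i32 %x) {
  ; only the low 16 bits of %x survive the shift, so the and
  ; below is dead: (shl %m, 16) == (shl %x, 16) for all %x
  %m = and i32 %x, 65535
  %r = shl i32 %m, 16
  ret i32 %r
}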
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index a693e13f37ea36..b01362605ea34d 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -472,7 +472,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 0, @10, KC0[], KC1[]
; R600-NEXT: TEX 1 @6
-; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
@@ -482,9 +482,12 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; R600-NEXT: ALU clause starting at 10:
; R600-NEXT: MOV * T0.X, 0.0,
; R600-NEXT: ALU clause starting at 11:
-; R600-NEXT: LSHL * T0.Y, T1.X, literal.x,
-; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; R600-NEXT: LSHL T0.X, T0.X, literal.x,
+; R600-NEXT: AND_INT * T0.W, T1.X, literal.x,
+; R600-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; R600-NEXT: LSHL T0.Y, PV.W, literal.x,
+; R600-NEXT: AND_INT * T0.W, T0.X, literal.y,
+; R600-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; R600-NEXT: LSHL T0.X, PV.W, literal.x,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
@@ -493,6 +496,8 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_mov_b32 s6, -1
@@ -511,10 +516,12 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
-; GFX8-NEXT: s_lshl_b32 s0, s3, 16
-; GFX8-NEXT: s_lshl_b32 s1, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX8-NEXT: s_and_b32 s1, s3, 0xffff
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
@@ -523,6 +530,8 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-NEXT: s_lshl_b32 s2, s2, 16
; GFX10-NEXT: s_lshl_b32 s3, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v0, s2
@@ -535,6 +544,8 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-NEXT: s_lshl_b32 s2, s2, 16
; GFX11-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-NEXT: v_mov_b32_e32 v0, s2
@@ -549,6 +560,8 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX940-NEXT: s_and_b32 s3, s3, 0xffff
; GFX940-NEXT: s_lshl_b32 s3, s3, 16
; GFX940-NEXT: s_lshl_b32 s2, s2, 16
; GFX940-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 4d8687b141a79a..b820b8b46a6eca 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -16,110 +16,116 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
-; GFX9-NEXT: v_xor_b32_e32 v10, v3, v9
-; GFX9-NEXT: v_xor_b32_e32 v11, v2, v9
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v11
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v10
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v11
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v7, v3, v4
+; GFX9-NEXT: v_xor_b32_e32 v8, v2, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v7
+; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v8
+; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, v8, v6
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
-; GFX9-NEXT: v_mul_lo_u32 v5, v7, v12
-; GFX9-NEXT: v_mul_hi_u32 v13, v6, v2
-; GFX9-NEXT: v_add3_u32 v5, v3, v5, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
-; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3
+; GFX9-NEXT: v_mul_lo_u32 v5, v11, v9
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v12
+; GFX9-NEXT: v_add3_u32 v13, v3, v6, v5
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v13, 0
+; GFX9-NEXT: v_mul_hi_u32 v3, v9, v2
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v3, v5
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v13, 0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v2, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v2
; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v7, v12
-; GFX9-NEXT: v_mul_lo_u32 v5, v8, v13
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
-; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
-; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT: v_xor_b32_e32 v6, v0, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
-; GFX9-NEXT: v_mul_hi_u32 v7, v6, v2
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v4
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v7, v10, v2
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
-; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
-; GFX9-NEXT: v_sub_u32_e32 v7, v5, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v7, v10, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v0, v11
-; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e64 v7, s[4:5], 2, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, v3, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
-; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11
-; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v3, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v10
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v12, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v13, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v2, v4, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_mul_lo_u32 v5, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v12
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v10, v3, v6, v5
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v10, 0
+; GFX9-NEXT: v_mul_hi_u32 v3, v9, v2
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, 0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v11, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v2, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_xor_b32_e32 v10, v0, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v12, v3, vcc
+; GFX9-NEXT: v_xor_b32_e32 v6, v1, v5
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_mul_hi_u32 v11, v10, v2
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, 0
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v11, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v12, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_lo_u32 v9, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v0, 0
+; GFX9-NEXT: v_mul_lo_u32 v11, v8, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_add3_u32 v3, v3, v11, v9
+; GFX9-NEXT: v_sub_u32_e32 v9, v6, v3
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v9, v7, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v2, v8
+; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 2, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
+; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 1, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v12, v10, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v5, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v1, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v2, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: .LBB0_2: ; %Flow
@@ -192,70 +198,76 @@ define i64 @udiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
-; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
-; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
-; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
-; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
-; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v7, v10, v8
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v7, v5, v7, v6
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
+; GFX9-NEXT: v_mul_hi_u32 v10, v9, v4
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6
-; GFX9-NEXT: v_mul_lo_u32 v9, v2, v7
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
-; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8
-; GFX9-NEXT: v_sub_u32_e32 v8, v1, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v8, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0
+; GFX9-NEXT: v_mul_hi_u32 v9, v0, v6
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v6, 0
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v7, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v4, 0
+; GFX9-NEXT: v_mul_lo_u32 v9, v2, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v8, v1, v7
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v8, v3, vcc
; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3
+; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v8, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v8, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v9, s[4:5]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v11, v9, s[4:5]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: .LBB1_2: ; %Flow
@@ -306,26 +318,26 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX9-NEXT: v_xor_b32_e32 v9, v3, v4
-; GFX9-NEXT: v_xor_b32_e32 v10, v2, v4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v9
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT: v_xor_b32_e32 v6, v3, v4
+; GFX9-NEXT: v_xor_b32_e32 v7, v2, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v6
+; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, 0, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc
; GFX9-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, v8, v6
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
-; GFX9-NEXT: v_mul_lo_u32 v5, v7, v11
-; GFX9-NEXT: v_mul_hi_u32 v12, v6, v2
+; GFX9-NEXT: v_mul_lo_u32 v4, v10, v8
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; GFX9-NEXT: v_mul_lo_u32 v5, v9, v11
+; GFX9-NEXT: v_mul_hi_u32 v12, v8, v2
; GFX9-NEXT: v_add3_u32 v5, v3, v5, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v3
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v4, vcc
@@ -333,78 +345,84 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v6, v2
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v7, v11
-; GFX9-NEXT: v_mul_lo_u32 v5, v8, v12
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
-; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
-; GFX9-NEXT: v_mul_hi_u32 v13, v12, v2
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v4, v10, v8
+; GFX9-NEXT: v_mul_lo_u32 v5, v9, v11
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; GFX9-NEXT: v_add3_u32 v5, v3, v5, v4
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v5, 0
+; GFX9-NEXT: v_mul_hi_u32 v9, v8, v2
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v3
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v10, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_xor_b32_e32 v6, v0, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
-; GFX9-NEXT: v_mul_hi_u32 v7, v6, v2
-; GFX9-NEXT: v_xor_b32_e32 v4, v4, v5
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT: v_xor_b32_e32 v9, v0, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v3, vcc
+; GFX9-NEXT: v_xor_b32_e32 v4, v1, v5
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v8, 0
+; GFX9-NEXT: v_mul_hi_u32 v10, v9, v2
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v2, 0
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v11, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, v9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, v10, v1
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v0, 0
-; GFX9-NEXT: v_add3_u32 v1, v1, v3, v2
-; GFX9-NEXT: v_sub_u32_e32 v2, v4, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v9, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v0, v10
-; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[6:7], 0, v2, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v3, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v9
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v9, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7]
-; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_lo_u32 v8, v6, v0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v0, 0
+; GFX9-NEXT: v_mul_lo_u32 v0, v7, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_add3_u32 v0, v3, v0, v8
+; GFX9-NEXT: v_sub_u32_e32 v1, v4, v0
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v6, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v2, v7
+; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v3, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v4, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v6
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v6, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
+; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v3, v7
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
+; GFX9-NEXT: v_subbrev_co_u32_e64 v1, s[4:5], 0, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v5
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v1, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v5, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: .LBB2_2: ; %Flow
@@ -475,67 +493,73 @@ define i64 @urem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
-; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
-; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
-; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
-; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
-; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v7, v10, v8
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v7, v5, v7, v6
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
+; GFX9-NEXT: v_mul_hi_u32 v10, v9, v4
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0
+; GFX9-NEXT: v_mul_hi_u32 v9, v0, v6
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v6, 0
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v7, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v4
-; GFX9-NEXT: v_mul_lo_u32 v7, v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v4, 0
-; GFX9-NEXT: v_add3_u32 v5, v5, v7, v6
-; GFX9-NEXT: v_sub_u32_e32 v6, v1, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v4, 0
+; GFX9-NEXT: v_mul_lo_u32 v4, v2, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT: v_add3_u32 v4, v7, v4, v8
+; GFX9-NEXT: v_sub_u32_e32 v5, v1, v4
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[4:5], v5, v3, vcc
; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v0, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[6:7], 0, v5, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, v3
-; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[4:5], v5, v3, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7]
; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[4:5], 0, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v9, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
@@ -709,121 +733,127 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB8_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
-; GFX9-NEXT: v_xor_b32_e32 v10, v3, v9
-; GFX9-NEXT: v_xor_b32_e32 v11, v2, v9
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v11
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v10
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v11
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
-; GFX9-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
-; GFX9-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX9-NEXT: v_trunc_f32_e32 v3, v3
-; GFX9-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, v8, v6
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
-; GFX9-NEXT: v_mul_lo_u32 v5, v7, v12
-; GFX9-NEXT: v_mul_hi_u32 v13, v6, v2
-; GFX9-NEXT: v_add3_u32 v5, v3, v5, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
-; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v7, v12
-; GFX9-NEXT: v_mul_lo_u32 v5, v8, v13
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
-; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
-; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v3, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_xor_b32_e32 v3, v5, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v2
+; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v3
+; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v2, vcc
+; GFX9-NEXT: v_madmk_f32 v5, v6, 0x4f800000, v5
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; GFX9-NEXT: v_trunc_f32_e32 v6, v6
+; GFX9-NEXT: v_madmk_f32 v5, v6, 0xcf800000, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v6
+; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_mul_lo_u32 v8, v10, v12
+; GFX9-NEXT: v_mul_hi_u32 v13, v9, v5
+; GFX9-NEXT: v_add3_u32 v8, v6, v8, v7
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v6
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v13, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v6, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v5, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], 32, v[5:6]
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v6, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v8, v10, v12
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v8, v6, v8, v7
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v8, 0
+; GFX9-NEXT: v_mul_hi_u32 v10, v9, v5
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v6
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v6, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v5, v7
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], 32, v[5:6]
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7
-; GFX9-NEXT: v_xor_b32_e32 v5, v0, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0
-; GFX9-NEXT: v_mul_hi_u32 v6, v5, v2
-; GFX9-NEXT: v_xor_b32_e32 v4, v4, v7
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
-; GFX9-NEXT: v_add3_u32 v1, v1, v8, v6
-; GFX9-NEXT: v_sub_u32_e32 v6, v4, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v5, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v11
-; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v9, v5
+; GFX9-NEXT: v_xor_b32_e32 v10, v0, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v12, v6, vcc
+; GFX9-NEXT: v_xor_b32_e32 v8, v1, v7
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_mul_hi_u32 v11, v10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v9, 0
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v6, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX9-NEXT: v_mul_lo_u32 v9, v2, v0
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v0, 0
+; GFX9-NEXT: v_mul_lo_u32 v11, v3, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v5
+; GFX9-NEXT: v_add3_u32 v6, v6, v11, v9
+; GFX9-NEXT: v_sub_u32_e32 v9, v8, v6
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v9, v2, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v11, s[4:5], v10, v3
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[6:7], 0, v9, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v2
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v11
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v3
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, v2
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7]
-; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v0
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v1, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v6, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3
; GFX9-NEXT: v_cndmask_b32_e64 v5, v16, v14, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v15, v13, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_xor_b32_e32 v5, v7, v9
-; GFX9-NEXT: v_xor_b32_e32 v2, v2, v5
-; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5
-; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v2, v5
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5]
-; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9]
-; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v8, v11
-; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v12, v2, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v15, v13, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v4
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v0, v5
+; GFX9-NEXT: v_subb_co_u32_e64 v0, s[4:5], v9, v2, s[4:5]
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v1, v5, s[8:9]
+; GFX9-NEXT: v_sub_co_u32_e64 v1, s[4:5], v11, v3
+; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[4:5], 0, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
-; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v0, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v1, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v0, v7, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: .LBB8_2: ; %Flow
@@ -905,76 +935,82 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
-; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
-; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
-; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
-; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
-; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v7, v10, v8
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v7, v5, v7, v6
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
+; GFX9-NEXT: v_mul_hi_u32 v10, v9, v4
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6
-; GFX9-NEXT: v_mul_lo_u32 v9, v2, v7
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
-; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8
-; GFX9-NEXT: v_sub_u32_e32 v8, v1, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0
+; GFX9-NEXT: v_mul_hi_u32 v9, v0, v6
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v6, 0
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v7, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 32, v[4:5]
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v4, 0
+; GFX9-NEXT: v_mul_lo_u32 v9, v2, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v8, v1, v7
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v8, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[6:7], 0, v6, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v11, s[6:7]
-; GFX9-NEXT: v_add_co_u32_e64 v11, s[6:7], 2, v6
-; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, v7, s[6:7]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 1, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v7, s[6:7]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v11, s[6:7], 2, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, v5, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 1, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7]
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v14, v12, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v14, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v8, v3, s[4:5]
-; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v9, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
+; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
+; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v13, v11, s[6:7]
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v11, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
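
(Editorial note, not part of the patch: the `v_lshlrev_b64 ..., 32` + `v_or_b32` pairs added in the GFX9 hunk above, and the matching `s_lshl_b64`/`s_or_b32` sequences in the sudiv64 hunks below, correspond to the usual hi/lo concatenation idiom. A minimal scalar sketch of that idiom, assuming the combiner can now infer nuw on the shl because the zext leaves 32 known-zero high bits:)

; Illustrative sketch only; @concat_hi_lo is a hypothetical function,
; not one of the tests touched by this patch.
define i64 @concat_hi_lo(i32 %hi, i32 %lo) {
  %hi64 = zext i32 %hi to i64
  %lo64 = zext i32 %lo to i64
  ; shl by 32 of a zext'd i32 cannot overflow, so nuw is inferable
  %hiShifted = shl nuw i64 %hi64, 32
  %res = or i64 %hiShifted, %lo64
  ret i64 %res
}
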
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 15ebdd70ae8818..b602b2411776e6 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -453,8 +453,9 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
;
; VI-LABEL: ps_mesa_inreg_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_and_b32 s1, s0, 0xffff0000
+; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: s_add_i32 s0, s0, 1
+; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: s_add_i32 s0, s0, 0x10000
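
(Editorial note, not part of the patch: in the VI hunk above, the single `s_and_b32` with 0xffff0000 is now emitted as an `s_lshr_b32`/`s_lshl_b32` pair. A scalar analogue of the masking involved, as an assumption for illustration; the actual test `ps_mesa_inreg_v2i16` increments the lanes of a `<2 x i16>` inreg argument:)

; Illustrative sketch only; @clear_lo16 is a hypothetical function,
; not one of the tests touched by this patch.
define i32 @clear_lo16(i32 %x) {
  ; (shl (lshr %x, 16), 16) clears the low 16 bits, which previously
  ; folded to an AND with 0xffff0000
  %hi = lshr i32 %x, 16
  %cleared = shl i32 %hi, 16
  ret i32 %cleared
}
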
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 9a22635e880f17..67b9486f7d1a80 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -1810,71 +1810,77 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; CISI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CISI-NEXT: v_trunc_f32_e32 v1, v1
; CISI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; CISI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CISI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CISI-NEXT: v_mul_lo_u32 v2, s0, v1
-; CISI-NEXT: v_mul_hi_u32 v3, s0, v0
-; CISI-NEXT: v_mul_lo_u32 v5, s1, v0
-; CISI-NEXT: v_mul_lo_u32 v4, s0, v0
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CISI-NEXT: v_mul_hi_u32 v3, v0, v4
-; CISI-NEXT: v_mul_lo_u32 v5, v0, v2
-; CISI-NEXT: v_mul_hi_u32 v7, v0, v2
-; CISI-NEXT: v_mul_lo_u32 v6, v1, v4
-; CISI-NEXT: v_mul_hi_u32 v4, v1, v4
-; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CISI-NEXT: v_cvt_u32_f32_e32 v2, v1
+; CISI-NEXT: v_cvt_u32_f32_e32 v3, v0
+; CISI-NEXT: v_mul_lo_u32 v0, s0, v2
+; CISI-NEXT: v_mul_hi_u32 v1, s0, v3
+; CISI-NEXT: v_mul_lo_u32 v5, s1, v3
+; CISI-NEXT: v_mul_lo_u32 v4, s0, v3
+; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CISI-NEXT: v_mul_hi_u32 v1, v3, v4
+; CISI-NEXT: v_mul_lo_u32 v5, v3, v0
+; CISI-NEXT: v_mul_hi_u32 v7, v3, v0
+; CISI-NEXT: v_mul_lo_u32 v6, v2, v4
+; CISI-NEXT: v_mul_hi_u32 v4, v2, v4
+; CISI-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; CISI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; CISI-NEXT: v_mul_hi_u32 v7, v1, v2
-; CISI-NEXT: v_mul_lo_u32 v2, v1, v2
-; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CISI-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; CISI-NEXT: v_mul_hi_u32 v7, v2, v0
+; CISI-NEXT: v_mul_lo_u32 v0, v2, v0
+; CISI-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; CISI-NEXT: v_addc_u32_e32 v1, vcc, v5, v4, vcc
; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CISI-NEXT: v_mul_lo_u32 v2, s0, v1
-; CISI-NEXT: v_mul_hi_u32 v3, s0, v0
-; CISI-NEXT: v_mul_lo_u32 v4, s1, v0
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CISI-NEXT: v_mul_lo_u32 v3, s0, v0
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CISI-NEXT: v_mul_lo_u32 v6, v0, v2
-; CISI-NEXT: v_mul_hi_u32 v7, v0, v3
-; CISI-NEXT: v_mul_hi_u32 v8, v0, v2
-; CISI-NEXT: v_mul_hi_u32 v5, v1, v3
-; CISI-NEXT: v_mul_lo_u32 v3, v1, v3
-; CISI-NEXT: v_mul_hi_u32 v4, v1, v2
-; CISI-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CISI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; CISI-NEXT: v_mul_lo_u32 v2, v1, v2
-; CISI-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CISI-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CISI-NEXT: v_mul_lo_u32 v2, s6, v1
-; CISI-NEXT: v_mul_hi_u32 v3, s6, v0
-; CISI-NEXT: v_mul_hi_u32 v4, s6, v1
+; CISI-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; CISI-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; CISI-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; CISI-NEXT: v_or_b32_e32 v0, v0, v5
+; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; CISI-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; CISI-NEXT: v_mul_hi_u32 v1, s0, v3
+; CISI-NEXT: v_mul_lo_u32 v4, s0, v2
+; CISI-NEXT: v_mul_lo_u32 v6, s1, v3
+; CISI-NEXT: v_mul_lo_u32 v0, s0, v3
+; CISI-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; CISI-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; CISI-NEXT: v_mul_lo_u32 v4, v3, v1
+; CISI-NEXT: v_mul_hi_u32 v6, v3, v0
+; CISI-NEXT: v_mul_hi_u32 v7, v3, v1
+; CISI-NEXT: v_mul_hi_u32 v5, v2, v0
+; CISI-NEXT: v_mul_lo_u32 v0, v2, v0
+; CISI-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CISI-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; CISI-NEXT: v_mul_hi_u32 v7, v2, v1
+; CISI-NEXT: v_mul_lo_u32 v1, v2, v1
+; CISI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CISI-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; CISI-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; CISI-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; CISI-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; CISI-NEXT: v_or_b32_e32 v0, v0, v5
+; CISI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CISI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; CISI-NEXT: v_mul_lo_u32 v3, s6, v1
+; CISI-NEXT: v_mul_hi_u32 v4, s6, v0
+; CISI-NEXT: v_mul_hi_u32 v5, s6, v1
+; CISI-NEXT: v_mul_hi_u32 v2, s7, v0
+; CISI-NEXT: v_mul_lo_u32 v0, s7, v0
+; CISI-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; CISI-NEXT: v_mul_hi_u32 v5, s7, v1
; CISI-NEXT: v_mul_lo_u32 v1, s7, v1
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CISI-NEXT: v_mul_lo_u32 v4, s7, v0
-; CISI-NEXT: v_mul_hi_u32 v0, s7, v0
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CISI-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; CISI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CISI-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; CISI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; CISI-NEXT: v_mul_lo_u32 v2, s2, v1
-; CISI-NEXT: v_mul_hi_u32 v3, s2, v0
-; CISI-NEXT: v_mul_lo_u32 v4, s3, v0
+; CISI-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; CISI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; CISI-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; CISI-NEXT: v_mov_b32_e32 v5, s3
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CISI-NEXT: v_or_b32_e32 v0, v0, v3
+; CISI-NEXT: v_mul_hi_u32 v2, s2, v0
+; CISI-NEXT: v_mul_lo_u32 v3, s2, v1
+; CISI-NEXT: v_mul_lo_u32 v4, s3, v0
+; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CISI-NEXT: v_mul_lo_u32 v3, s2, v0
; CISI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CISI-NEXT: v_sub_i32_e32 v4, vcc, s7, v2
@@ -1976,53 +1982,59 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v0
-; VI-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
-; VI-NEXT: v_mul_lo_u32 v4, s8, v7
-; VI-NEXT: v_mul_lo_u32 v5, s9, v6
-; VI-NEXT: v_mul_hi_u32 v8, v6, v0
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0
-; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0
-; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; VI-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
+; VI-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; VI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; VI-NEXT: v_mul_lo_u32 v2, s8, v6
+; VI-NEXT: v_mul_lo_u32 v3, s9, v5
+; VI-NEXT: v_mul_hi_u32 v8, v5, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v1, v3
+; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v7, 0
+; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v0, 0
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v7, 0
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v8, v3
+; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
-; VI-NEXT: v_mul_hi_u32 v4, s6, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
+; VI-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_add_u32_e32 v4, vcc, v5, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v1, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v5, 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v4, 0
+; VI-NEXT: v_readfirstlane_b32 s10, v3
+; VI-NEXT: v_mul_hi_u32 v3, s6, v4
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_readfirstlane_b32 s9, v0
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v3, 0
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v2, 0
-; VI-NEXT: v_readfirstlane_b32 s10, v4
-; VI-NEXT: s_add_u32 s0, s10, s9
-; VI-NEXT: s_addc_u32 s1, 0, s8
-; VI-NEXT: v_readfirstlane_b32 s10, v2
-; VI-NEXT: v_readfirstlane_b32 s9, v3
-; VI-NEXT: s_add_u32 s0, s0, s10
-; VI-NEXT: v_readfirstlane_b32 s8, v1
-; VI-NEXT: s_addc_u32 s0, s1, s9
-; VI-NEXT: s_addc_u32 s10, s8, 0
-; VI-NEXT: v_readfirstlane_b32 s1, v0
-; VI-NEXT: s_add_u32 s11, s0, s1
-; VI-NEXT: v_mov_b32_e32 v2, s11
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v5, 0
+; VI-NEXT: v_readfirstlane_b32 s11, v2
+; VI-NEXT: v_readfirstlane_b32 s0, v3
+; VI-NEXT: s_add_u32 s0, s0, s11
+; VI-NEXT: s_addc_u32 s1, 0, s10
+; VI-NEXT: s_add_u32 s0, s0, s9
+; VI-NEXT: v_readfirstlane_b32 s10, v1
+; VI-NEXT: s_addc_u32 s0, s1, s8
+; VI-NEXT: s_addc_u32 s1, s10, 0
+; VI-NEXT: v_readfirstlane_b32 s8, v0
+; VI-NEXT: s_add_u32 s8, s0, s8
+; VI-NEXT: s_addc_u32 s0, 0, s1
+; VI-NEXT: s_lshl_b64 s[10:11], s[0:1], 32
+; VI-NEXT: s_or_b32 s10, s10, s8
+; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0
-; VI-NEXT: s_addc_u32 s10, 0, s10
-; VI-NEXT: s_mul_i32 s0, s2, s10
+; VI-NEXT: s_mul_i32 s12, s2, s11
+; VI-NEXT: s_mul_i32 s0, s3, s10
; VI-NEXT: v_readfirstlane_b32 s1, v1
-; VI-NEXT: s_add_i32 s0, s1, s0
-; VI-NEXT: s_mul_i32 s1, s3, s11
-; VI-NEXT: s_add_i32 s12, s0, s1
+; VI-NEXT: s_add_i32 s1, s1, s12
+; VI-NEXT: s_add_i32 s12, s1, s0
; VI-NEXT: s_sub_i32 s0, s7, s12
; VI-NEXT: v_sub_u32_e32 v0, vcc, s6, v0
; VI-NEXT: s_cmp_lg_u64 vcc, 0
@@ -2038,16 +2050,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: v_mov_b32_e32 v3, s14
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; VI-NEXT: s_add_u32 s0, s11, 1
-; VI-NEXT: s_addc_u32 s13, s10, 0
-; VI-NEXT: s_add_u32 s1, s11, 2
-; VI-NEXT: s_addc_u32 s11, s10, 0
+; VI-NEXT: s_add_u32 s0, s10, 1
+; VI-NEXT: s_addc_u32 s13, s11, 0
+; VI-NEXT: s_add_u32 s1, s10, 2
+; VI-NEXT: s_addc_u32 s10, s11, 0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
; VI-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: v_mov_b32_e32 v4, s11
+; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: s_cmp_lg_u64 vcc, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; VI-NEXT: s_subb_u32 s0, s7, s12
@@ -2059,7 +2071,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v4, s11
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
@@ -2109,8 +2121,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s0, 0, s2
-; GFX9-NEXT: s_subb_u32 s1, 0, s3
+; GFX9-NEXT: s_sub_u32 s10, 0, s2
+; GFX9-NEXT: s_subb_u32 s11, 0, s3
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2119,105 +2131,111 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s10, v1
-; GFX9-NEXT: v_readfirstlane_b32 s11, v0
-; GFX9-NEXT: s_mul_i32 s12, s0, s10
-; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11
-; GFX9-NEXT: s_mul_i32 s13, s1, s11
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s15, s0, s11
-; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
-; GFX9-NEXT: s_mul_i32 s14, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15
-; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mul_i32 s1, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s0
+; GFX9-NEXT: s_mul_i32 s13, s11, s0
+; GFX9-NEXT: s_add_i32 s1, s14, s1
+; GFX9-NEXT: s_add_i32 s1, s1, s13
+; GFX9-NEXT: s_mul_i32 s15, s10, s0
+; GFX9-NEXT: s_mul_hi_u32 s13, s0, s1
+; GFX9-NEXT: s_mul_i32 s14, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s0, s15
+; GFX9-NEXT: s_add_u32 s0, s0, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15
-; GFX9-NEXT: s_mul_i32 s15, s10, s15
-; GFX9-NEXT: s_add_u32 s11, s11, s15
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12
-; GFX9-NEXT: s_addc_u32 s11, s13, s16
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_mul_hi_u32 s14, s12, s1
+; GFX9-NEXT: s_addc_u32 s0, s13, s16
; GFX9-NEXT: s_addc_u32 s13, s14, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s11, s11, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s13
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s11, v0
+; GFX9-NEXT: s_mul_i32 s1, s12, s1
+; GFX9-NEXT: s_add_u32 s14, s0, s1
+; GFX9-NEXT: s_addc_u32 s0, 0, s13
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX9-NEXT: s_or_b32 s0, s0, s14
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s12
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s11, s0, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
-; GFX9-NEXT: s_add_i32 s11, s13, s11
-; GFX9-NEXT: s_mul_i32 s1, s1, s12
-; GFX9-NEXT: s_add_i32 s11, s11, s1
-; GFX9-NEXT: s_mul_i32 s0, s0, s12
-; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0
-; GFX9-NEXT: s_mul_i32 s14, s10, s0
-; GFX9-NEXT: s_mul_i32 s16, s12, s11
-; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
-; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11
-; GFX9-NEXT: s_add_u32 s0, s0, s16
-; GFX9-NEXT: s_addc_u32 s12, 0, s15
+; GFX9-NEXT: s_addc_u32 s12, s12, s1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mul_i32 s1, s10, s0
+; GFX9-NEXT: s_mul_hi_u32 s15, s10, s0
+; GFX9-NEXT: s_mul_i32 s10, s10, s12
+; GFX9-NEXT: s_add_i32 s10, s15, s10
+; GFX9-NEXT: s_mul_i32 s11, s11, s0
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10
+; GFX9-NEXT: s_mul_i32 s15, s0, s10
+; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s12, s1
+; GFX9-NEXT: s_mul_i32 s14, s12, s1
+; GFX9-NEXT: s_addc_u32 s1, 0, s11
; GFX9-NEXT: s_add_u32 s0, s0, s14
-; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11
-; GFX9-NEXT: s_addc_u32 s0, s12, s13
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_mul_i32 s11, s10, s11
-; GFX9-NEXT: s_add_u32 s0, s0, s11
-; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_addc_u32 s0, s1, s13
+; GFX9-NEXT: s_addc_u32 s1, s11, 0
+; GFX9-NEXT: s_mul_i32 s10, s12, s10
+; GFX9-NEXT: s_add_u32 s10, s0, s10
+; GFX9-NEXT: s_addc_u32 s0, 0, s1
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX9-NEXT: s_or_b32 s0, s0, s10
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s0, s10, s1
-; GFX9-NEXT: v_readfirstlane_b32 s11, v0
-; GFX9-NEXT: s_mul_i32 s10, s6, s0
-; GFX9-NEXT: s_mul_hi_u32 s12, s6, s11
-; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0
-; GFX9-NEXT: s_add_u32 s10, s12, s10
-; GFX9-NEXT: s_addc_u32 s1, 0, s1
-; GFX9-NEXT: s_mul_hi_u32 s13, s7, s11
-; GFX9-NEXT: s_mul_i32 s11, s7, s11
-; GFX9-NEXT: s_add_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s7, s0
-; GFX9-NEXT: s_addc_u32 s1, s1, s13
-; GFX9-NEXT: s_addc_u32 s10, s12, 0
-; GFX9-NEXT: s_mul_i32 s0, s7, s0
-; GFX9-NEXT: s_add_u32 s11, s1, s0
-; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_i32 s0, s2, s10
-; GFX9-NEXT: s_mul_hi_u32 s1, s2, s11
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_addc_u32 s1, s12, s1
+; GFX9-NEXT: s_mul_hi_u32 s10, s7, s0
+; GFX9-NEXT: s_mul_i32 s11, s7, s0
+; GFX9-NEXT: s_mul_i32 s13, s6, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0
+; GFX9-NEXT: s_mul_hi_u32 s12, s6, s1
+; GFX9-NEXT: s_add_u32 s0, s0, s13
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_add_u32 s0, s0, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s7, s1
+; GFX9-NEXT: s_addc_u32 s0, s12, s10
+; GFX9-NEXT: s_addc_u32 s10, s13, 0
+; GFX9-NEXT: s_mul_i32 s1, s7, s1
+; GFX9-NEXT: s_add_u32 s12, s0, s1
+; GFX9-NEXT: s_addc_u32 s0, 0, s10
+; GFX9-NEXT: s_mul_hi_u32 s1, s2, s12
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 32
+; GFX9-NEXT: s_mul_i32 s0, s2, s11
; GFX9-NEXT: s_add_i32 s0, s1, s0
-; GFX9-NEXT: s_mul_i32 s1, s3, s11
-; GFX9-NEXT: s_add_i32 s12, s0, s1
-; GFX9-NEXT: s_mul_i32 s1, s2, s11
+; GFX9-NEXT: s_mul_i32 s1, s3, s12
+; GFX9-NEXT: s_add_i32 s13, s0, s1
+; GFX9-NEXT: s_mul_i32 s1, s2, s12
; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: s_sub_i32 s0, s7, s12
+; GFX9-NEXT: s_sub_i32 s0, s7, s13
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s13, s0, s3
+; GFX9-NEXT: s_subb_u32 s14, s0, s3
; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s13, s13, 0
-; GFX9-NEXT: s_cmp_ge_u32 s13, s3
-; GFX9-NEXT: s_cselect_b32 s14, -1, 0
+; GFX9-NEXT: s_subb_u32 s14, s14, 0
+; GFX9-NEXT: s_cmp_ge_u32 s14, s3
+; GFX9-NEXT: s_cselect_b32 s15, -1, 0
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
-; GFX9-NEXT: s_cmp_eq_u32 s13, s3
+; GFX9-NEXT: s_cmp_eq_u32 s14, s3
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s14
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_or_b32 s10, s10, s12
; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s11, 1
-; GFX9-NEXT: s_addc_u32 s13, s10, 0
-; GFX9-NEXT: s_add_u32 s1, s11, 2
-; GFX9-NEXT: s_addc_u32 s14, s10, 0
+; GFX9-NEXT: s_add_u32 s0, s10, 1
+; GFX9-NEXT: s_addc_u32 s12, s11, 0
+; GFX9-NEXT: s_add_u32 s1, s10, 2
+; GFX9-NEXT: s_addc_u32 s14, s11, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
; GFX9-NEXT: v_mov_b32_e32 v3, s14
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT: s_subb_u32 s0, s7, s12
+; GFX9-NEXT: s_subb_u32 s0, s7, s13
; GFX9-NEXT: s_cmp_ge_u32 s0, s3
; GFX9-NEXT: s_cselect_b32 s1, -1, 0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
@@ -2226,9 +2244,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s11
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
@@ -2277,8 +2295,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: ; %bb.1:
; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX1010-NEXT: s_sub_u32 s9, 0, s2
-; GFX1010-NEXT: s_subb_u32 s10, 0, s3
+; GFX1010-NEXT: s_sub_u32 s10, 0, s2
+; GFX1010-NEXT: s_subb_u32 s11, 0, s3
; GFX1010-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1010-NEXT: v_rcp_f32_e32 v0, v0
; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2287,111 +2305,117 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1010-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1010-NEXT: v_readfirstlane_b32 s1, v0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s0
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s1
-; GFX1010-NEXT: s_mul_i32 s12, s10, s1
-; GFX1010-NEXT: s_add_i32 s11, s13, s11
-; GFX1010-NEXT: s_mul_i32 s14, s9, s1
-; GFX1010-NEXT: s_add_i32 s11, s11, s12
-; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14
-; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14
-; GFX1010-NEXT: s_mul_i32 s12, s0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX1010-NEXT: s_mul_i32 s1, s1, s11
-; GFX1010-NEXT: s_mul_hi_u32 s16, s0, s11
-; GFX1010-NEXT: s_add_u32 s1, s13, s1
+; GFX1010-NEXT: v_readfirstlane_b32 s9, v1
+; GFX1010-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1010-NEXT: s_mul_i32 s1, s10, s9
+; GFX1010-NEXT: s_mul_hi_u32 s13, s10, s0
+; GFX1010-NEXT: s_mul_i32 s12, s11, s0
+; GFX1010-NEXT: s_add_i32 s1, s13, s1
+; GFX1010-NEXT: s_mul_i32 s14, s10, s0
+; GFX1010-NEXT: s_add_i32 s1, s1, s12
+; GFX1010-NEXT: s_mul_hi_u32 s13, s0, s14
+; GFX1010-NEXT: s_mul_hi_u32 s15, s9, s14
+; GFX1010-NEXT: s_mul_i32 s12, s9, s14
+; GFX1010-NEXT: s_mul_hi_u32 s14, s0, s1
+; GFX1010-NEXT: s_mul_i32 s0, s0, s1
+; GFX1010-NEXT: s_mul_hi_u32 s16, s9, s1
+; GFX1010-NEXT: s_add_u32 s0, s13, s0
; GFX1010-NEXT: s_addc_u32 s13, 0, s14
-; GFX1010-NEXT: s_add_u32 s1, s1, s12
-; GFX1010-NEXT: s_mul_i32 s11, s0, s11
-; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: s_add_u32 s0, s0, s12
+; GFX1010-NEXT: s_mul_i32 s1, s9, s1
+; GFX1010-NEXT: s_addc_u32 s0, s13, s15
; GFX1010-NEXT: s_addc_u32 s12, s16, 0
-; GFX1010-NEXT: s_add_u32 s1, s1, s11
+; GFX1010-NEXT: s_add_u32 s13, s0, s1
+; GFX1010-NEXT: s_addc_u32 s0, 0, s12
+; GFX1010-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1010-NEXT: s_or_b32 s0, s0, s13
+; GFX1010-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1010-NEXT: s_addc_u32 s9, s9, s1
+; GFX1010-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1010-NEXT: s_mul_i32 s1, s10, s9
+; GFX1010-NEXT: s_mul_i32 s12, s10, s0
+; GFX1010-NEXT: s_mul_hi_u32 s10, s10, s0
+; GFX1010-NEXT: s_mul_i32 s11, s11, s0
+; GFX1010-NEXT: s_add_i32 s1, s10, s1
+; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1010-NEXT: s_add_i32 s1, s1, s11
+; GFX1010-NEXT: s_mul_i32 s10, s9, s12
+; GFX1010-NEXT: s_mul_hi_u32 s11, s0, s12
+; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s1
+; GFX1010-NEXT: s_mul_i32 s0, s0, s1
+; GFX1010-NEXT: s_mul_hi_u32 s14, s9, s1
+; GFX1010-NEXT: s_add_u32 s0, s11, s0
; GFX1010-NEXT: s_addc_u32 s11, 0, s12
-; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1010-NEXT: s_addc_u32 s0, s0, s11
-; GFX1010-NEXT: v_readfirstlane_b32 s1, v0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s0
-; GFX1010-NEXT: s_mul_hi_u32 s12, s9, s1
-; GFX1010-NEXT: s_mul_i32 s10, s10, s1
-; GFX1010-NEXT: s_add_i32 s11, s12, s11
-; GFX1010-NEXT: s_mul_i32 s9, s9, s1
-; GFX1010-NEXT: s_add_i32 s11, s11, s10
-; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s9
-; GFX1010-NEXT: s_mul_i32 s13, s0, s9
-; GFX1010-NEXT: s_mul_hi_u32 s9, s1, s9
-; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX1010-NEXT: s_mul_i32 s1, s1, s11
-; GFX1010-NEXT: s_mul_hi_u32 s10, s0, s11
-; GFX1010-NEXT: s_add_u32 s1, s9, s1
-; GFX1010-NEXT: s_addc_u32 s9, 0, s14
-; GFX1010-NEXT: s_add_u32 s1, s1, s13
-; GFX1010-NEXT: s_mul_i32 s11, s0, s11
-; GFX1010-NEXT: s_addc_u32 s1, s9, s12
-; GFX1010-NEXT: s_addc_u32 s9, s10, 0
-; GFX1010-NEXT: s_add_u32 s1, s1, s11
-; GFX1010-NEXT: s_addc_u32 s9, 0, s9
-; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1010-NEXT: s_addc_u32 s0, s0, s9
+; GFX1010-NEXT: s_add_u32 s0, s0, s10
+; GFX1010-NEXT: s_mul_i32 s1, s9, s1
+; GFX1010-NEXT: s_addc_u32 s0, s11, s13
+; GFX1010-NEXT: s_addc_u32 s10, s14, 0
+; GFX1010-NEXT: s_add_u32 s11, s0, s1
+; GFX1010-NEXT: s_addc_u32 s0, 0, s10
+; GFX1010-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1010-NEXT: s_or_b32 s0, s0, s11
+; GFX1010-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1010-NEXT: s_addc_u32 s0, s9, s1
; GFX1010-NEXT: v_readfirstlane_b32 s1, v0
; GFX1010-NEXT: s_mul_i32 s10, s6, s0
; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s0
; GFX1010-NEXT: s_mul_hi_u32 s11, s7, s0
; GFX1010-NEXT: s_mul_i32 s0, s7, s0
-; GFX1010-NEXT: s_mul_hi_u32 s12, s6, s1
-; GFX1010-NEXT: s_mul_hi_u32 s13, s7, s1
+; GFX1010-NEXT: s_mul_hi_u32 s13, s6, s1
+; GFX1010-NEXT: s_mul_hi_u32 s12, s7, s1
; GFX1010-NEXT: s_mul_i32 s1, s7, s1
-; GFX1010-NEXT: s_add_u32 s10, s12, s10
+; GFX1010-NEXT: s_add_u32 s10, s13, s10
; GFX1010-NEXT: s_addc_u32 s9, 0, s9
; GFX1010-NEXT: s_add_u32 s1, s10, s1
-; GFX1010-NEXT: s_addc_u32 s1, s9, s13
+; GFX1010-NEXT: s_addc_u32 s1, s9, s12
; GFX1010-NEXT: s_addc_u32 s9, s11, 0
-; GFX1010-NEXT: s_add_u32 s1, s1, s0
-; GFX1010-NEXT: s_addc_u32 s9, 0, s9
-; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1
-; GFX1010-NEXT: s_mul_i32 s11, s2, s9
+; GFX1010-NEXT: s_add_u32 s10, s1, s0
+; GFX1010-NEXT: s_addc_u32 s0, 0, s9
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s10
+; GFX1010-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1010-NEXT: s_mul_i32 s13, s2, s10
; GFX1010-NEXT: s_mul_i32 s12, s2, s1
-; GFX1010-NEXT: s_add_i32 s0, s0, s11
-; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12
-; GFX1010-NEXT: s_mul_i32 s10, s3, s1
-; GFX1010-NEXT: s_add_i32 s0, s0, s10
-; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2
-; GFX1010-NEXT: s_sub_i32 s10, s7, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1010-NEXT: s_subb_u32 s10, s10, s3
+; GFX1010-NEXT: s_mul_i32 s11, s3, s10
+; GFX1010-NEXT: s_add_i32 s9, s9, s12
+; GFX1010-NEXT: v_sub_co_u32 v0, s12, s6, s13
+; GFX1010-NEXT: s_add_i32 s9, s9, s11
+; GFX1010-NEXT: s_sub_i32 s11, s7, s9
+; GFX1010-NEXT: v_sub_co_u32 v1, s13, v0, s2
; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
+; GFX1010-NEXT: s_subb_u32 s11, s11, s3
+; GFX1010-NEXT: s_cmp_lg_u32 s13, 0
; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
-; GFX1010-NEXT: s_subb_u32 s10, s10, 0
-; GFX1010-NEXT: s_cmp_ge_u32 s10, s3
+; GFX1010-NEXT: s_subb_u32 s11, s11, 0
+; GFX1010-NEXT: s_cmp_ge_u32 s11, s3
; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1010-NEXT: s_cmp_eq_u32 s10, s3
+; GFX1010-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1010-NEXT: s_cmp_eq_u32 s11, s3
; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX1010-NEXT: s_add_u32 s10, s1, 1
-; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
-; GFX1010-NEXT: s_addc_u32 s12, s9, 0
-; GFX1010-NEXT: s_add_u32 s13, s1, 2
-; GFX1010-NEXT: s_addc_u32 s14, s9, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1010-NEXT: s_or_b32 s10, s0, s10
+; GFX1010-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
+; GFX1010-NEXT: s_add_u32 s11, s10, 1
+; GFX1010-NEXT: s_addc_u32 s13, s1, 0
+; GFX1010-NEXT: s_add_u32 s0, s10, 2
+; GFX1010-NEXT: s_addc_u32 s14, s1, 0
+; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
-; GFX1010-NEXT: s_subb_u32 s0, s7, s0
-; GFX1010-NEXT: v_mov_b32_e32 v2, s13
-; GFX1010-NEXT: s_cmp_ge_u32 s0, s3
+; GFX1010-NEXT: s_subb_u32 s7, s7, s9
+; GFX1010-NEXT: v_mov_b32_e32 v2, s0
+; GFX1010-NEXT: s_cmp_ge_u32 s7, s3
; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX1010-NEXT: s_cselect_b32 s7, -1, 0
-; GFX1010-NEXT: s_cmp_eq_u32 s0, s3
+; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1010-NEXT: s_cmp_eq_u32 s7, s3
; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s14
-; GFX1010-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
-; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
-; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v0, s9, v0, s0
+; GFX1010-NEXT: v_cndmask_b32_e32 v2, s11, v2, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1010-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
-; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e32 v1, s1, v1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo
; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3
; GFX1010-NEXT: .LBB16_2:
@@ -2439,8 +2463,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: ; %bb.1:
; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX1030W32-NEXT: s_sub_u32 s9, 0, s2
-; GFX1030W32-NEXT: s_subb_u32 s10, 0, s3
+; GFX1030W32-NEXT: s_sub_u32 s10, 0, s2
+; GFX1030W32-NEXT: s_subb_u32 s11, 0, s3
; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W32-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2449,111 +2473,117 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s0
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s1
-; GFX1030W32-NEXT: s_mul_i32 s12, s10, s1
-; GFX1030W32-NEXT: s_add_i32 s11, s13, s11
-; GFX1030W32-NEXT: s_mul_i32 s14, s9, s1
-; GFX1030W32-NEXT: s_add_i32 s11, s11, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s1, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s15, s0, s14
-; GFX1030W32-NEXT: s_mul_i32 s12, s0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX1030W32-NEXT: s_mul_i32 s1, s1, s11
-; GFX1030W32-NEXT: s_mul_hi_u32 s16, s0, s11
-; GFX1030W32-NEXT: s_add_u32 s1, s13, s1
+; GFX1030W32-NEXT: v_readfirstlane_b32 s9, v1
+; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1030W32-NEXT: s_mul_i32 s1, s10, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s10, s0
+; GFX1030W32-NEXT: s_mul_i32 s12, s11, s0
+; GFX1030W32-NEXT: s_add_i32 s1, s13, s1
+; GFX1030W32-NEXT: s_mul_i32 s14, s10, s0
+; GFX1030W32-NEXT: s_add_i32 s1, s1, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s0, s14
+; GFX1030W32-NEXT: s_mul_hi_u32 s15, s9, s14
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s14
+; GFX1030W32-NEXT: s_mul_hi_u32 s14, s0, s1
+; GFX1030W32-NEXT: s_mul_i32 s0, s0, s1
+; GFX1030W32-NEXT: s_mul_hi_u32 s16, s9, s1
+; GFX1030W32-NEXT: s_add_u32 s0, s13, s0
; GFX1030W32-NEXT: s_addc_u32 s13, 0, s14
-; GFX1030W32-NEXT: s_add_u32 s1, s1, s12
-; GFX1030W32-NEXT: s_mul_i32 s11, s0, s11
-; GFX1030W32-NEXT: s_addc_u32 s1, s13, s15
+; GFX1030W32-NEXT: s_add_u32 s0, s0, s12
+; GFX1030W32-NEXT: s_mul_i32 s1, s9, s1
+; GFX1030W32-NEXT: s_addc_u32 s0, s13, s15
; GFX1030W32-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W32-NEXT: s_add_u32 s1, s1, s11
+; GFX1030W32-NEXT: s_add_u32 s13, s0, s1
+; GFX1030W32-NEXT: s_addc_u32 s0, 0, s12
+; GFX1030W32-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1030W32-NEXT: s_or_b32 s0, s0, s13
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1030W32-NEXT: s_addc_u32 s9, s9, s1
+; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1030W32-NEXT: s_mul_i32 s1, s10, s9
+; GFX1030W32-NEXT: s_mul_i32 s12, s10, s0
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s10, s0
+; GFX1030W32-NEXT: s_mul_i32 s11, s11, s0
+; GFX1030W32-NEXT: s_add_i32 s1, s10, s1
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1030W32-NEXT: s_add_i32 s1, s1, s11
+; GFX1030W32-NEXT: s_mul_i32 s10, s9, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s0, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s1
+; GFX1030W32-NEXT: s_mul_i32 s0, s0, s1
+; GFX1030W32-NEXT: s_mul_hi_u32 s14, s9, s1
+; GFX1030W32-NEXT: s_add_u32 s0, s11, s0
; GFX1030W32-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1030W32-NEXT: s_addc_u32 s0, s0, s11
-; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s0
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s9, s1
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s1
-; GFX1030W32-NEXT: s_add_i32 s11, s12, s11
-; GFX1030W32-NEXT: s_mul_i32 s9, s9, s1
-; GFX1030W32-NEXT: s_add_i32 s11, s11, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s9
-; GFX1030W32-NEXT: s_mul_i32 s13, s0, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s1, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX1030W32-NEXT: s_mul_i32 s1, s1, s11
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s0, s11
-; GFX1030W32-NEXT: s_add_u32 s1, s9, s1
-; GFX1030W32-NEXT: s_addc_u32 s9, 0, s14
-; GFX1030W32-NEXT: s_add_u32 s1, s1, s13
-; GFX1030W32-NEXT: s_mul_i32 s11, s0, s11
-; GFX1030W32-NEXT: s_addc_u32 s1, s9, s12
-; GFX1030W32-NEXT: s_addc_u32 s9, s10, 0
-; GFX1030W32-NEXT: s_add_u32 s1, s1, s11
-; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1030W32-NEXT: s_addc_u32 s0, s0, s9
+; GFX1030W32-NEXT: s_add_u32 s0, s0, s10
+; GFX1030W32-NEXT: s_mul_i32 s1, s9, s1
+; GFX1030W32-NEXT: s_addc_u32 s0, s11, s13
+; GFX1030W32-NEXT: s_addc_u32 s10, s14, 0
+; GFX1030W32-NEXT: s_add_u32 s11, s0, s1
+; GFX1030W32-NEXT: s_addc_u32 s0, 0, s10
+; GFX1030W32-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1030W32-NEXT: s_or_b32 s0, s0, s11
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1030W32-NEXT: s_addc_u32 s0, s9, s1
; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0
; GFX1030W32-NEXT: s_mul_i32 s10, s6, s0
; GFX1030W32-NEXT: s_mul_hi_u32 s9, s6, s0
; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s0
; GFX1030W32-NEXT: s_mul_i32 s0, s7, s0
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s1
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s1
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s6, s1
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s1
; GFX1030W32-NEXT: s_mul_i32 s1, s7, s1
-; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
+; GFX1030W32-NEXT: s_add_u32 s10, s13, s10
; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9
; GFX1030W32-NEXT: s_add_u32 s1, s10, s1
-; GFX1030W32-NEXT: s_addc_u32 s1, s9, s13
+; GFX1030W32-NEXT: s_addc_u32 s1, s9, s12
; GFX1030W32-NEXT: s_addc_u32 s9, s11, 0
-; GFX1030W32-NEXT: s_add_u32 s1, s1, s0
-; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s0, s2, s1
-; GFX1030W32-NEXT: s_mul_i32 s11, s2, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s1, s0
+; GFX1030W32-NEXT: s_addc_u32 s0, 0, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s10
+; GFX1030W32-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1030W32-NEXT: s_mul_i32 s13, s2, s10
; GFX1030W32-NEXT: s_mul_i32 s12, s2, s1
-; GFX1030W32-NEXT: s_add_i32 s0, s0, s11
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s11, s6, s12
-; GFX1030W32-NEXT: s_mul_i32 s10, s3, s1
-; GFX1030W32-NEXT: s_add_i32 s0, s0, s10
-; GFX1030W32-NEXT: v_sub_co_u32 v1, s12, v0, s2
-; GFX1030W32-NEXT: s_sub_i32 s10, s7, s0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1030W32-NEXT: s_subb_u32 s10, s10, s3
+; GFX1030W32-NEXT: s_mul_i32 s11, s3, s10
+; GFX1030W32-NEXT: s_add_i32 s9, s9, s12
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s12, s6, s13
+; GFX1030W32-NEXT: s_add_i32 s9, s9, s11
+; GFX1030W32-NEXT: s_sub_i32 s11, s7, s9
+; GFX1030W32-NEXT: v_sub_co_u32 v1, s13, v0, s2
; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
+; GFX1030W32-NEXT: s_subb_u32 s11, s11, s3
+; GFX1030W32-NEXT: s_cmp_lg_u32 s13, 0
; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
-; GFX1030W32-NEXT: s_subb_u32 s10, s10, 0
-; GFX1030W32-NEXT: s_cmp_ge_u32 s10, s3
+; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
+; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s3
; GFX1030W32-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1030W32-NEXT: s_cmp_eq_u32 s10, s3
+; GFX1030W32-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1030W32-NEXT: s_cmp_eq_u32 s11, s3
; GFX1030W32-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX1030W32-NEXT: s_add_u32 s10, s1, 1
-; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
-; GFX1030W32-NEXT: s_addc_u32 s12, s9, 0
-; GFX1030W32-NEXT: s_add_u32 s13, s1, 2
-; GFX1030W32-NEXT: s_addc_u32 s14, s9, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1030W32-NEXT: s_or_b32 s10, s0, s10
+; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
+; GFX1030W32-NEXT: s_add_u32 s11, s10, 1
+; GFX1030W32-NEXT: s_addc_u32 s13, s1, 0
+; GFX1030W32-NEXT: s_add_u32 s0, s10, 2
+; GFX1030W32-NEXT: s_addc_u32 s14, s1, 0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
-; GFX1030W32-NEXT: s_subb_u32 s0, s7, s0
-; GFX1030W32-NEXT: v_mov_b32_e32 v2, s13
-; GFX1030W32-NEXT: s_cmp_ge_u32 s0, s3
+; GFX1030W32-NEXT: s_subb_u32 s7, s7, s9
+; GFX1030W32-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030W32-NEXT: s_cmp_ge_u32 s7, s3
; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX1030W32-NEXT: s_cselect_b32 s7, -1, 0
-; GFX1030W32-NEXT: s_cmp_eq_u32 s0, s3
+; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1030W32-NEXT: s_cmp_eq_u32 s7, s3
; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX1030W32-NEXT: s_cselect_b32 s0, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s14
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
-; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
-; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s9, v0, s0
+; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s11, v2, vcc_lo
+; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
-; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
+; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s1, v1, vcc_lo
+; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo
; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3
; GFX1030W32-NEXT: .LBB16_2:
@@ -2601,8 +2631,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX1030W64-NEXT: s_sub_u32 s9, 0, s2
-; GFX1030W64-NEXT: s_subb_u32 s10, 0, s3
+; GFX1030W64-NEXT: s_sub_u32 s11, 0, s2
+; GFX1030W64-NEXT: s_subb_u32 s12, 0, s3
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2611,98 +2641,104 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s0
-; GFX1030W64-NEXT: s_mul_i32 s11, s10, s0
-; GFX1030W64-NEXT: s_add_i32 s1, s12, s1
-; GFX1030W64-NEXT: s_mul_i32 s13, s9, s0
-; GFX1030W64-NEXT: s_add_i32 s1, s1, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1
-; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1
-; GFX1030W64-NEXT: s_mul_hi_u32 s15, s8, s1
-; GFX1030W64-NEXT: s_add_u32 s0, s12, s0
-; GFX1030W64-NEXT: s_addc_u32 s12, 0, s13
-; GFX1030W64-NEXT: s_add_u32 s0, s0, s11
-; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1
-; GFX1030W64-NEXT: s_addc_u32 s0, s12, s14
-; GFX1030W64-NEXT: s_addc_u32 s11, s15, 0
-; GFX1030W64-NEXT: s_add_u32 s0, s0, s1
-; GFX1030W64-NEXT: s_addc_u32 s11, 0, s11
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
+; GFX1030W64-NEXT: v_readfirstlane_b32 s10, v1
; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s9, s0
-; GFX1030W64-NEXT: s_mul_i32 s10, s10, s0
-; GFX1030W64-NEXT: s_add_i32 s1, s11, s1
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s0
-; GFX1030W64-NEXT: s_add_i32 s1, s1, s10
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s9
-; GFX1030W64-NEXT: s_mul_i32 s12, s8, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s0, s9
+; GFX1030W64-NEXT: s_mul_i32 s1, s11, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s11, s0
+; GFX1030W64-NEXT: s_mul_i32 s8, s12, s0
+; GFX1030W64-NEXT: s_add_i32 s1, s9, s1
+; GFX1030W64-NEXT: s_mul_i32 s13, s11, s0
+; GFX1030W64-NEXT: s_add_i32 s1, s1, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s0, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s10, s13
+; GFX1030W64-NEXT: s_mul_i32 s8, s10, s13
; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1
; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s1
+; GFX1030W64-NEXT: s_mul_hi_u32 s15, s10, s1
; GFX1030W64-NEXT: s_add_u32 s0, s9, s0
; GFX1030W64-NEXT: s_addc_u32 s9, 0, s13
-; GFX1030W64-NEXT: s_add_u32 s0, s0, s12
-; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1
-; GFX1030W64-NEXT: s_addc_u32 s0, s9, s11
-; GFX1030W64-NEXT: s_addc_u32 s9, s10, 0
-; GFX1030W64-NEXT: s_add_u32 s0, s0, s1
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1030W64-NEXT: s_addc_u32 s0, s8, s9
+; GFX1030W64-NEXT: s_add_u32 s0, s0, s8
+; GFX1030W64-NEXT: s_mul_i32 s1, s10, s1
+; GFX1030W64-NEXT: s_addc_u32 s0, s9, s14
+; GFX1030W64-NEXT: s_addc_u32 s8, s15, 0
+; GFX1030W64-NEXT: s_add_u32 s9, s0, s1
+; GFX1030W64-NEXT: s_addc_u32 s0, 0, s8
+; GFX1030W64-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1030W64-NEXT: s_or_b32 s0, s0, s9
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[8:9], v0, s0
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_addc_u32 s10, s10, s1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1030W64-NEXT: s_mul_i32 s1, s11, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s11, s0
+; GFX1030W64-NEXT: s_mul_i32 s12, s12, s0
+; GFX1030W64-NEXT: s_add_i32 s1, s9, s1
+; GFX1030W64-NEXT: s_mul_i32 s8, s11, s0
+; GFX1030W64-NEXT: s_add_i32 s1, s1, s12
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s10, s8
+; GFX1030W64-NEXT: s_mul_i32 s9, s10, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s8, s0, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s1
+; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s10, s1
+; GFX1030W64-NEXT: s_add_u32 s0, s8, s0
+; GFX1030W64-NEXT: s_addc_u32 s8, 0, s12
+; GFX1030W64-NEXT: s_add_u32 s0, s0, s9
+; GFX1030W64-NEXT: s_mul_i32 s1, s10, s1
+; GFX1030W64-NEXT: s_addc_u32 s0, s8, s11
+; GFX1030W64-NEXT: s_addc_u32 s8, s13, 0
+; GFX1030W64-NEXT: s_add_u32 s9, s0, s1
+; GFX1030W64-NEXT: s_addc_u32 s0, 0, s8
+; GFX1030W64-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1030W64-NEXT: s_or_b32 s0, s0, s9
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[8:9], v0, s0
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_addc_u32 s0, s10, s1
; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v0
; GFX1030W64-NEXT: s_mul_i32 s9, s6, s0
; GFX1030W64-NEXT: s_mul_hi_u32 s8, s6, s0
; GFX1030W64-NEXT: s_mul_hi_u32 s10, s7, s0
; GFX1030W64-NEXT: s_mul_i32 s0, s7, s0
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s1
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s1
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s1
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s7, s1
; GFX1030W64-NEXT: s_mul_i32 s1, s7, s1
-; GFX1030W64-NEXT: s_add_u32 s9, s11, s9
+; GFX1030W64-NEXT: s_add_u32 s9, s12, s9
; GFX1030W64-NEXT: s_addc_u32 s8, 0, s8
; GFX1030W64-NEXT: s_add_u32 s1, s9, s1
-; GFX1030W64-NEXT: s_addc_u32 s1, s8, s12
+; GFX1030W64-NEXT: s_addc_u32 s1, s8, s11
; GFX1030W64-NEXT: s_addc_u32 s8, s10, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s1, s0
-; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s10
-; GFX1030W64-NEXT: s_mul_i32 s1, s2, s11
-; GFX1030W64-NEXT: s_mul_i32 s9, s2, s10
-; GFX1030W64-NEXT: s_add_i32 s12, s0, s1
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9
-; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
-; GFX1030W64-NEXT: s_add_i32 s12, s12, s8
-; GFX1030W64-NEXT: v_sub_co_u32 v1, s[8:9], v0, s2
-; GFX1030W64-NEXT: s_sub_i32 s13, s7, s12
+; GFX1030W64-NEXT: s_add_u32 s12, s1, s0
+; GFX1030W64-NEXT: s_addc_u32 s0, 0, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s1, s2, s12
+; GFX1030W64-NEXT: s_mul_i32 s11, s2, s12
+; GFX1030W64-NEXT: s_lshl_b64 s[8:9], s[0:1], 32
+; GFX1030W64-NEXT: s_mul_i32 s10, s3, s12
+; GFX1030W64-NEXT: s_mul_i32 s0, s2, s9
+; GFX1030W64-NEXT: s_add_i32 s13, s1, s0
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, s11
+; GFX1030W64-NEXT: s_add_i32 s13, s13, s10
+; GFX1030W64-NEXT: s_sub_i32 s14, s7, s13
+; GFX1030W64-NEXT: v_sub_co_u32 v1, s[10:11], v0, s2
; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1030W64-NEXT: s_subb_u32 s13, s13, s3
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_subb_u32 s14, s14, s3
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v1
-; GFX1030W64-NEXT: s_subb_u32 s8, s13, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s3
+; GFX1030W64-NEXT: s_subb_u32 s10, s14, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s10, s3
; GFX1030W64-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s3
+; GFX1030W64-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1030W64-NEXT: s_cmp_eq_u32 s10, s3
; GFX1030W64-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX1030W64-NEXT: s_add_u32 s8, s10, 1
-; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
-; GFX1030W64-NEXT: s_addc_u32 s9, s11, 0
-; GFX1030W64-NEXT: s_add_u32 s13, s10, 2
-; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
+; GFX1030W64-NEXT: s_or_b32 s8, s8, s12
+; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc
+; GFX1030W64-NEXT: s_add_u32 s10, s8, 1
+; GFX1030W64-NEXT: s_addc_u32 s11, s9, 0
+; GFX1030W64-NEXT: s_add_u32 s12, s8, 2
+; GFX1030W64-NEXT: s_addc_u32 s14, s9, 0
; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
-; GFX1030W64-NEXT: s_subb_u32 s0, s7, s12
-; GFX1030W64-NEXT: v_mov_b32_e32 v2, s13
+; GFX1030W64-NEXT: s_subb_u32 s0, s7, s13
+; GFX1030W64-NEXT: v_mov_b32_e32 v2, s12
; GFX1030W64-NEXT: s_cmp_ge_u32 s0, s3
; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
@@ -2711,11 +2747,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s14
; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc
-; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
-; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc
; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc
-; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc
+; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
+; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s8, v2, vcc
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2763,8 +2799,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX11-NEXT: s_sub_u32 s9, 0, s2
-; GFX11-NEXT: s_subb_u32 s10, 0, s3
+; GFX11-NEXT: s_sub_u32 s10, 0, s2
+; GFX11-NEXT: s_subb_u32 s11, 0, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
@@ -2778,120 +2814,129 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_readfirstlane_b32 s9, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_mul_i32 s11, s9, s0
-; GFX11-NEXT: s_mul_hi_u32 s13, s9, s1
-; GFX11-NEXT: s_mul_i32 s12, s10, s1
-; GFX11-NEXT: s_add_i32 s11, s13, s11
-; GFX11-NEXT: s_mul_i32 s14, s9, s1
-; GFX11-NEXT: s_add_i32 s11, s11, s12
-; GFX11-NEXT: s_mul_hi_u32 s13, s1, s14
-; GFX11-NEXT: s_mul_hi_u32 s15, s0, s14
-; GFX11-NEXT: s_mul_i32 s12, s0, s14
-; GFX11-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX11-NEXT: s_mul_i32 s1, s1, s11
-; GFX11-NEXT: s_mul_hi_u32 s16, s0, s11
-; GFX11-NEXT: s_add_u32 s1, s13, s1
+; GFX11-NEXT: s_mul_i32 s1, s10, s9
+; GFX11-NEXT: s_mul_hi_u32 s13, s10, s0
+; GFX11-NEXT: s_mul_i32 s12, s11, s0
+; GFX11-NEXT: s_add_i32 s1, s13, s1
+; GFX11-NEXT: s_mul_i32 s14, s10, s0
+; GFX11-NEXT: s_add_i32 s1, s1, s12
+; GFX11-NEXT: s_mul_hi_u32 s13, s0, s14
+; GFX11-NEXT: s_mul_hi_u32 s15, s9, s14
+; GFX11-NEXT: s_mul_i32 s12, s9, s14
+; GFX11-NEXT: s_mul_hi_u32 s14, s0, s1
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: s_mul_hi_u32 s16, s9, s1
+; GFX11-NEXT: s_add_u32 s0, s13, s0
; GFX11-NEXT: s_addc_u32 s13, 0, s14
-; GFX11-NEXT: s_add_u32 s1, s1, s12
-; GFX11-NEXT: s_mul_i32 s11, s0, s11
-; GFX11-NEXT: s_addc_u32 s1, s13, s15
+; GFX11-NEXT: s_add_u32 s0, s0, s12
+; GFX11-NEXT: s_mul_i32 s1, s9, s1
+; GFX11-NEXT: s_addc_u32 s0, s13, s15
; GFX11-NEXT: s_addc_u32 s12, s16, 0
-; GFX11-NEXT: s_add_u32 s1, s1, s11
-; GFX11-NEXT: s_addc_u32 s11, 0, s12
-; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
+; GFX11-NEXT: s_add_u32 s13, s0, s1
+; GFX11-NEXT: s_addc_u32 s0, 0, s12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX11-NEXT: s_or_b32 s0, s0, s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_addc_u32 s9, s9, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_addc_u32 s0, s0, s11
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_mul_i32 s11, s9, s0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mul_i32 s1, s10, s9
+; GFX11-NEXT: s_mul_i32 s12, s10, s0
+; GFX11-NEXT: s_mul_hi_u32 s10, s10, s0
+; GFX11-NEXT: s_mul_i32 s11, s11, s0
+; GFX11-NEXT: s_add_i32 s1, s10, s1
+; GFX11-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX11-NEXT: s_add_i32 s1, s1, s11
+; GFX11-NEXT: s_mul_i32 s10, s9, s12
+; GFX11-NEXT: s_mul_hi_u32 s11, s0, s12
+; GFX11-NEXT: s_mul_hi_u32 s12, s0, s1
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: s_mul_hi_u32 s14, s9, s1
+; GFX11-NEXT: s_add_u32 s0, s11, s0
+; GFX11-NEXT: s_addc_u32 s11, 0, s12
+; GFX11-NEXT: s_add_u32 s0, s0, s10
+; GFX11-NEXT: s_mul_i32 s1, s9, s1
+; GFX11-NEXT: s_addc_u32 s0, s11, s13
+; GFX11-NEXT: s_addc_u32 s10, s14, 0
+; GFX11-NEXT: s_add_u32 s11, s0, s1
+; GFX11-NEXT: s_addc_u32 s0, 0, s10
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX11-NEXT: s_or_b32 s0, s0, s11
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_addc_u32 s0, s9, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_mul_hi_u32 s12, s9, s1
-; GFX11-NEXT: s_mul_i32 s10, s10, s1
-; GFX11-NEXT: s_add_i32 s11, s12, s11
-; GFX11-NEXT: s_mul_i32 s9, s9, s1
-; GFX11-NEXT: s_add_i32 s11, s11, s10
-; GFX11-NEXT: s_mul_hi_u32 s12, s0, s9
-; GFX11-NEXT: s_mul_i32 s13, s0, s9
-; GFX11-NEXT: s_mul_hi_u32 s9, s1, s9
-; GFX11-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX11-NEXT: s_mul_i32 s1, s1, s11
-; GFX11-NEXT: s_mul_hi_u32 s10, s0, s11
-; GFX11-NEXT: s_add_u32 s1, s9, s1
-; GFX11-NEXT: s_addc_u32 s9, 0, s14
-; GFX11-NEXT: s_add_u32 s1, s1, s13
-; GFX11-NEXT: s_mul_i32 s11, s0, s11
-; GFX11-NEXT: s_addc_u32 s1, s9, s12
-; GFX11-NEXT: s_addc_u32 s9, s10, 0
-; GFX11-NEXT: s_add_u32 s1, s1, s11
-; GFX11-NEXT: s_addc_u32 s9, 0, s9
-; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_addc_u32 s0, s0, s9
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s10, s6, s0
; GFX11-NEXT: s_mul_hi_u32 s9, s6, s0
; GFX11-NEXT: s_mul_hi_u32 s11, s7, s0
; GFX11-NEXT: s_mul_i32 s0, s7, s0
-; GFX11-NEXT: s_mul_hi_u32 s12, s6, s1
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s1
+; GFX11-NEXT: s_mul_hi_u32 s13, s6, s1
+; GFX11-NEXT: s_mul_hi_u32 s12, s7, s1
; GFX11-NEXT: s_mul_i32 s1, s7, s1
-; GFX11-NEXT: s_add_u32 s10, s12, s10
+; GFX11-NEXT: s_add_u32 s10, s13, s10
; GFX11-NEXT: s_addc_u32 s9, 0, s9
; GFX11-NEXT: s_add_u32 s1, s10, s1
-; GFX11-NEXT: s_addc_u32 s1, s9, s13
+; GFX11-NEXT: s_addc_u32 s1, s9, s12
; GFX11-NEXT: s_addc_u32 s9, s11, 0
-; GFX11-NEXT: s_add_u32 s1, s1, s0
-; GFX11-NEXT: s_addc_u32 s9, 0, s9
-; GFX11-NEXT: s_mul_hi_u32 s0, s2, s1
-; GFX11-NEXT: s_mul_i32 s11, s2, s9
+; GFX11-NEXT: s_add_u32 s10, s1, s0
+; GFX11-NEXT: s_addc_u32 s0, 0, s9
+; GFX11-NEXT: s_mul_hi_u32 s9, s2, s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX11-NEXT: s_mul_i32 s13, s2, s10
; GFX11-NEXT: s_mul_i32 s12, s2, s1
-; GFX11-NEXT: s_add_i32 s0, s0, s11
-; GFX11-NEXT: v_sub_co_u32 v0, s11, s6, s12
-; GFX11-NEXT: s_mul_i32 s10, s3, s1
+; GFX11-NEXT: s_mul_i32 s11, s3, s10
+; GFX11-NEXT: s_add_i32 s9, s9, s12
+; GFX11-NEXT: v_sub_co_u32 v0, s12, s6, s13
+; GFX11-NEXT: s_add_i32 s9, s9, s11
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s0, s0, s10
-; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2
-; GFX11-NEXT: s_sub_i32 s10, s7, s0
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
-; GFX11-NEXT: s_subb_u32 s10, s10, s3
+; GFX11-NEXT: s_sub_i32 s11, s7, s9
+; GFX11-NEXT: v_sub_co_u32 v1, s13, v0, s2
; GFX11-NEXT: s_cmp_lg_u32 s12, 0
+; GFX11-NEXT: s_subb_u32 s11, s11, s3
+; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
-; GFX11-NEXT: s_subb_u32 s10, s10, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_ge_u32 s10, s3
+; GFX11-NEXT: s_subb_u32 s11, s11, 0
+; GFX11-NEXT: s_cmp_ge_u32 s11, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 s12, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s10, s3
+; GFX11-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s11, s3
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_add_u32 s10, s1, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
-; GFX11-NEXT: s_addc_u32 s12, s9, 0
-; GFX11-NEXT: s_add_u32 s13, s1, 2
-; GFX11-NEXT: s_addc_u32 s14, s9, 0
-; GFX11-NEXT: v_mov_b32_e32 v2, s13
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_or_b32 s10, s0, s10
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
+; GFX11-NEXT: s_add_u32 s11, s10, 1
+; GFX11-NEXT: s_addc_u32 s13, s1, 0
+; GFX11-NEXT: s_add_u32 s0, s10, 2
+; GFX11-NEXT: s_addc_u32 s14, s1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
-; GFX11-NEXT: s_subb_u32 s0, s7, s0
+; GFX11-NEXT: s_subb_u32 s7, s7, s9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_ge_u32 s0, s3
+; GFX11-NEXT: s_cmp_ge_u32 s7, s3
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 s7, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, s3
+; GFX11-NEXT: s_cselect_b32 s9, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s7, s3
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v1, s14
-; GFX11-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, s9, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s11, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s1, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX11-NEXT: s_cbranch_vccnz .LBB16_3
; GFX11-NEXT: .LBB16_2:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index c27e44609c527f..4326c41c9377d5 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -174,12 +174,18 @@ define i64 @load_4xi16_noncombine(ptr addrspace(1) %p) #0 {
; GCN-LABEL: load_4xi16_noncombine:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GCN-NEXT: v_bfi_b32 v0, s4, v2, v3
-; GCN-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b64 v[2:3], 32, v[2:3]
+; GCN-NEXT: v_and_or_b32 v0, v0, s4, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i32 3
%gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 849348a7be53dd..f72e3ffc379b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -14,6 +14,10 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s0, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: main:
diff --git a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
index 11795cca18daa2..70a19293d942cc 100644
--- a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
@@ -162,7 +162,8 @@ define i32 @known_bits_mul24() {
; GFX9-LABEL: known_bits_mul24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mul_i32_i24_e64 v0, 0, -7
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%r0 = call i32 @llvm.amdgcn.mul.i24(i32 0, i32 -7)
%r1 = shl i32 %r0, 2
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index e9dbce9026ca04..027bce8b42897c 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -260,18 +260,20 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
-; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v0
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
-; SI-NEXT: v_or_b32_e32 v1, v4, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x900, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0
@@ -298,18 +300,20 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; VI-NEXT: v_add_u16_e32 v1, 9, v1
-; VI-NEXT: v_add_u16_e32 v3, 9, v0
+; VI-NEXT: v_lshrrev_b16_e32 v3, 8, v0
+; VI-NEXT: v_add_u16_e32 v4, 9, v0
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0
-; VI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; VI-NEXT: v_or_b32_e32 v1, v4, v1
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: v_or_b32_e32 v3, v3, v4
; VI-NEXT: v_add_u16_e32 v1, 0x900, v1
-; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
+; VI-NEXT: v_add_u16_e32 v3, 0x900, v3
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: v_or_b32_e32 v1, v3, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: s_endpgm
@@ -345,18 +349,20 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
-; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v0
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
-; SI-NEXT: v_or_b32_e32 v1, v4, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x900, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
@@ -384,18 +390,20 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; VI-NEXT: v_add_u16_e32 v1, 9, v1
-; VI-NEXT: v_add_u16_e32 v3, 9, v0
+; VI-NEXT: v_lshrrev_b16_e32 v3, 8, v0
+; VI-NEXT: v_add_u16_e32 v4, 9, v0
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0
-; VI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; VI-NEXT: v_or_b32_e32 v1, v4, v1
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: v_or_b32_e32 v3, v3, v4
; VI-NEXT: v_add_u16_e32 v1, 0x900, v1
-; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
+; VI-NEXT: v_add_u16_e32 v3, 0x900, v3
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: v_or_b32_e32 v1, v3, v1
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index b6359f18169799..20ec2ddb884342 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -367,7 +367,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -378,7 +378,9 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
+; EG-NEXT: MOV * T2.X, T0.X,
+; EG-NEXT: MOV * T0.X, PV.X,
+; EG-NEXT: LSHR * T0.W, PV.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
@@ -457,50 +459,54 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 42, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: MOV T2.X, T0.X,
+; EG-NEXT: MOV * T3.X, T0.Y,
+; EG-NEXT: MOV T0.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: AND_INT * T0.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: MOV T0.X, T3.X,
; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.X, literal.x,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.Y, literal.x,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
@@ -601,56 +607,64 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
+; EG-NEXT: ALU 84, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: LSHR * T0.W, T12.X, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: MOV T6.X, T0.Z,
+; EG-NEXT: MOV * T7.X, T0.W,
+; EG-NEXT: MOV T2.X, T0.X,
+; EG-NEXT: MOV * T3.X, T0.Y,
+; EG-NEXT: MOV T0.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR * T0.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT * T0.W, PV.W,
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; EG-NEXT: MOV T0.X, T3.X,
+; EG-NEXT: MOV * T0.Z, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.Y, T7.X,
+; EG-NEXT: MOV * T4.X, T0.W,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT T1.W, PS,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T12.Y, literal.x,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: AND_INT * T0.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
@@ -660,7 +674,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T8.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
@@ -668,7 +682,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T12.W, literal.x,
+; EG-NEXT: LSHR * T0.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
@@ -678,17 +692,19 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.X, T4.X,
-; EG-NEXT: MOV * T0.Z, T8.X,
+; EG-NEXT: MOV T12.Y, T5.X,
+; EG-NEXT: MOV * T12.X, T4.X,
+; EG-NEXT: MOV * T12.W, PV.X,
+; EG-NEXT: MOV * T12.Z, T8.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32
@@ -837,174 +853,198 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
;
; EG-LABEL: v_ctpop_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 114, @16, KC0[], KC1[]
-; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
+; EG-NEXT: ALU 2, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @10
+; EG-NEXT: ALU 3, @17, KC0[], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 114, @21, KC0[], KC1[]
+; EG-NEXT: ALU 51, @136, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T20.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 0, #1
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: LSHR * T0.W, T20.X, literal.x,
+; EG-NEXT: ALU clause starting at 17:
+; EG-NEXT: MOV T14.X, T20.Z,
+; EG-NEXT: MOV * T15.X, T20.W,
+; EG-NEXT: MOV T10.X, T20.X,
+; EG-NEXT: MOV * T11.X, T20.Y,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: MOV T6.X, T0.Z,
+; EG-NEXT: MOV * T7.X, T0.W,
+; EG-NEXT: MOV T2.X, T0.X,
+; EG-NEXT: MOV * T3.X, T0.Y,
+; EG-NEXT: MOV T0.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR * T0.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT * T0.W, PV.W,
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.X, literal.x,
+; EG-NEXT: MOV T0.X, T3.X,
+; EG-NEXT: MOV * T0.Z, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T1.Y, T7.X,
+; EG-NEXT: MOV T1.Z, T10.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T14.X,
+; EG-NEXT: MOV * T2.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.X, T0.W,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT T2.W, PS,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T2.W, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: AND_INT * T0.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T2.W, PV.W,
; EG-NEXT: MOV * T8.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
+; EG-NEXT: LSHR * T0.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T2.W, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.X, T12.X,
-; EG-NEXT: LSHR * T1.W, T21.X, literal.x,
+; EG-NEXT: LSHR * T0.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T2.W, PV.W,
; EG-NEXT: MOV * T12.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.X, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T12.X, PV.W,
; EG-NEXT: MOV * T0.X, T13.X,
-; EG-NEXT: LSHR * T1.W, T21.Y, literal.x,
+; EG-NEXT: LSHR * T0.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T2.W, PV.W,
; EG-NEXT: MOV * T13.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT * T0.W, PV.W,
+; EG-NEXT: ALU clause starting at 136:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T20.Y, PS, PV.W,
-; EG-NEXT: MOV T13.X, PV.Y,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
+; EG-NEXT: MOV T13.X, PV.W,
; EG-NEXT: MOV * T0.X, T16.X,
-; EG-NEXT: LSHR * T1.W, T21.Z, literal.x,
+; EG-NEXT: LSHR * T0.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: ALU clause starting at 131:
-; EG-NEXT: MOV * T16.X, T1.W,
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: MOV * T16.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x,
+; EG-NEXT: AND_INT * T0.W, T2.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T16.X, PV.W,
; EG-NEXT: MOV * T0.X, T17.X,
-; EG-NEXT: LSHR * T1.W, T21.W, literal.x,
+; EG-NEXT: LSHR * T0.W, T2.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T17.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT T1.W, T21.W, literal.x,
-; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T0.W, T2.Z, literal.x,
+; EG-NEXT: LSHR * T20.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
; EG-NEXT: AND_INT T0.Z, PV.X, literal.x,
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: -65536(nan), 16(2.242078e-44)
-; EG-NEXT: LSHR T22.X, PS, literal.x,
-; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W,
+; EG-NEXT: LSHR T0.X, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T17.X, PV.W,
-; EG-NEXT: MOV * T0.X, T4.X,
-; EG-NEXT: MOV * T0.Z, T8.X,
-; EG-NEXT: MOV T20.X, T12.X,
-; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T21.Y, T5.X,
+; EG-NEXT: MOV * T21.X, T4.X,
+; EG-NEXT: MOV * T21.W, T9.X,
+; EG-NEXT: MOV * T21.Z, T8.X,
+; EG-NEXT: MOV * T22.Y, T13.X,
+; EG-NEXT: MOV T22.X, T12.X,
+; EG-NEXT: MOV * T22.W, T17.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T22.Z, T16.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32
@@ -1587,11 +1627,11 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; EG-NEXT: ALU clause starting at 32:
; EG-NEXT: BCNT_INT * T0.X, T0.X,
; EG-NEXT: ALU clause starting at 33:
-; EG-NEXT: LSHL * T1.W, T0.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
-; EG-NEXT: 24(3.363116e-44), 65535(9.183409e-41)
+; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL T1.X, PS, PV.W,
; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 028a28ed9a23b7..e1942f26729a24 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1397,9 +1397,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[4:5]
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3
-; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
-; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
+; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
@@ -1408,18 +1408,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4
-; SI-NEXT: v_or_b32_e32 v5, v5, v4
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v6, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT: v_bfe_u32 v5, v5, 0, 16
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
+; SI-NEXT: v_alignbit_b32 v4, v5, v4, 24
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
-; SI-NEXT: v_or_b32_e32 v4, v4, v6
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
; SI-NEXT: s_endpgm
@@ -1580,23 +1582,25 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
+; SI-NEXT: v_bfe_u32 v7, v4, 8, 8
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v7
+; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v5
; SI-NEXT: v_and_b32_e32 v0, 0xff, v4
-; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xff00, v5
-; SI-NEXT: v_or_b32_e32 v0, v6, v0
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1608,34 +1612,36 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: v_mov_b32_e32 v5, 9
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v6, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
+; VI-NEXT: v_lshrrev_b16_e32 v8, 8, v4
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
-; VI-NEXT: v_and_b32_e32 v6, 0xffffff00, v4
-; VI-NEXT: v_add_u16_e32 v4, 9, v4
+; VI-NEXT: v_add_u16_e32 v9, 9, v4
+; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_nop 0
-; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v5
-; VI-NEXT: v_add_u16_e32 v2, 9, v5
-; VI-NEXT: v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_mov_b32_e32 v2, 0x900
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7
+; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
-; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1644,17 +1650,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v3, 8, v0
; GFX10-NEXT: v_add_nc_u16 v4, v0, 9
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v1
-; GFX10-NEXT: v_add_nc_u16 v1, v1, 9
-; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_add_nc_u16 v2, v2, 9
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
@@ -1675,25 +1683,27 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 9
; GFX9-NEXT: s_movk_i32 s4, 0x900
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4
+; GFX9-NEXT: v_lshrrev_b16_e32 v8, 8, v4
; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
-; GFX9-NEXT: v_add_u16_e32 v4, 9, v4
+; GFX9-NEXT: v_add_u16_e32 v9, 9, v4
+; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v6
-; GFX9-NEXT: v_add_u16_e32 v2, 9, v6
-; GFX9-NEXT: v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0
; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1710,20 +1720,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_add_nc_u16 v2, v0, 9
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u16 v3, v1, 9
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_nc_u16 v1, v1, 9
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
+; GFX11-NEXT: v_lshlrev_b16 v4, 8, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v1, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
-; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 8f31bb1fe0a81c..9843df02f33e07 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -364,8 +364,9 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000
+; GCN-NEXT: s_lshr_b32 s3, s3, 16
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NEXT: s_lshl_b32 s3, s3, 16
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
@@ -418,8 +419,10 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
; GCN-LABEL: divergent_vec_i16_LH:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, 0xffff
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_i16_LH:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
index 8c3155fc5c6ea8..1a0ff9e0c126d7 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
@@ -8,7 +8,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out,
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s2, s2, s3
-; GCN-NEXT: s_sext_i32_i8 s2, s2
+; GCN-NEXT: s_bfe_i32 s2, s2, 0x80000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
@@ -52,7 +52,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out,
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s2, s2, s3
-; GCN-NEXT: s_sext_i32_i16 s2, s2
+; GCN-NEXT: s_bfe_i32 s2, s2, 0x100000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 777a8f3fef1c17..c3d842fe37d3a4 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -403,26 +403,31 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %
define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
; CI-LABEL: read2_ptr_is_subreg_f32:
; CI: ; %bb.0:
+; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
+; CI-NEXT: ds_read_b32 v2, v0
+; CI-NEXT: ds_read_b32 v1, v1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_f32_e32 v2, v1, v2
+; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
+; GFX9-NEXT: v_add_lshl_u32 v1, v0, 8, 2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: ds_read_b32 v2, v0
+; GFX9-NEXT: ds_read_b32 v1, v1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr.0 = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) @lds, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 72ee660dc2adb8..cddcc1848a439f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -12,21 +12,21 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
; CHECK-NEXT: v_lshr_b32_e64 v2, s32, 6
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_bfe_u32 v2, v0, 1, 1
-; CHECK-NEXT: v_bfe_u32 v3, v0, 2, 2
-; CHECK-NEXT: v_bfe_u32 v4, v0, 3, 1
-; CHECK-NEXT: v_lshrrev_b32_e32 v5, 4, v0
-; CHECK-NEXT: v_bfe_u32 v6, v0, 5, 1
-; CHECK-NEXT: v_lshrrev_b32_e32 v7, 6, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v8, 7, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 4, v0
+; CHECK-NEXT: v_bfe_u32 v3, v0, 1, 1
+; CHECK-NEXT: v_bfe_u32 v4, v0, 2, 2
+; CHECK-NEXT: v_bfe_u32 v5, v0, 3, 1
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 6, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 7, v0
; CHECK-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; CHECK-NEXT: buffer_store_byte v8, off, s[0:3], s32 offset:7
-; CHECK-NEXT: buffer_store_byte v7, off, s[0:3], s32 offset:6
-; CHECK-NEXT: buffer_store_byte v6, off, s[0:3], s32 offset:5
-; CHECK-NEXT: buffer_store_byte v5, off, s[0:3], s32 offset:4
-; CHECK-NEXT: buffer_store_byte v4, off, s[0:3], s32 offset:3
-; CHECK-NEXT: buffer_store_byte v3, off, s[0:3], s32 offset:2
-; CHECK-NEXT: buffer_store_byte v2, off, s[0:3], s32 offset:1
+; CHECK-NEXT: v_bfe_u32 v0, v2, 1, 1
+; CHECK-NEXT: buffer_store_byte v7, off, s[0:3], s32 offset:7
+; CHECK-NEXT: buffer_store_byte v6, off, s[0:3], s32 offset:6
+; CHECK-NEXT: buffer_store_byte v2, off, s[0:3], s32 offset:4
+; CHECK-NEXT: buffer_store_byte v5, off, s[0:3], s32 offset:3
+; CHECK-NEXT: buffer_store_byte v4, off, s[0:3], s32 offset:2
+; CHECK-NEXT: buffer_store_byte v3, off, s[0:3], s32 offset:1
+; CHECK-NEXT: buffer_store_byte v0, off, s[0:3], s32 offset:5
; CHECK-NEXT: buffer_load_ubyte v0, v1, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 6dabd8c0b83eae..17f7e545df6b81 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -30,15 +30,16 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT: v_bfe_u32 v3, v5, 0, 16
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v5, v3
+; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB0_3
; SI-NEXT: s_branch .LBB0_4
; SI-NEXT: .LBB0_2:
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB0_3: ; %T
@@ -48,11 +49,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -62,30 +63,31 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_or_b32_e32 v4, v3, v1
+; SI-NEXT: v_bfe_u32 v3, v5, 0, 16
; SI-NEXT: .LBB0_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v3, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_mov_b32_e32 v3, 0xffff
; SI-NEXT: v_mov_b32_e32 v4, 0x8000
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v6, 1
-; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, -1, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, -1, v5, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_or_b32_e32 v2, v4, v2
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_bfe_u32 v3, v3, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -190,6 +192,8 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT: v_bfe_u32 v2, v2, 0, 16
+; SI-NEXT: v_bfe_u32 v4, v4, 0, 16
; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: v_or_b32_e32 v5, v5, v7
; SI-NEXT: s_mov_b64 vcc, exec
@@ -214,18 +218,20 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v0
-; SI-NEXT: v_or_b32_e32 v5, v5, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; SI-NEXT: v_bfe_u32 v2, v0, 0, 16
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_or_b32_e32 v5, v4, v5
+; SI-NEXT: v_bfe_u32 v4, v6, 0, 16
; SI-NEXT: .LBB1_4: ; %exit
; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
@@ -233,20 +239,21 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_mov_b32_e32 v4, 0xffff
; SI-NEXT: v_mov_b32_e32 v5, 0x8000
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, -1, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, -1, v6, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_bfe_u32 v3, v4, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -350,10 +357,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_bfe_u32 v5, v5, 0, 16
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v7
+; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: s_mov_b64 vcc, exec
@@ -387,6 +395,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_bfe_u32 v3, v3, 0, 16
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -499,9 +508,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -532,15 +541,16 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT: v_bfe_u32 v3, v5, 0, 16
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v5, v3
+; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB3_3
; SI-NEXT: s_branch .LBB3_4
; SI-NEXT: .LBB3_2:
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB3_3: ; %T
@@ -550,11 +560,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -580,30 +590,31 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_or_b32_e32 v4, v3, v1
+; SI-NEXT: v_bfe_u32 v3, v5, 0, 16
; SI-NEXT: .LBB3_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v3, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_mov_b32_e32 v3, 0xffff
; SI-NEXT: v_mov_b32_e32 v4, 0x8000
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v6, 1
-; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, -1, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, -1, v5, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_or_b32_e32 v2, v4, v2
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_bfe_u32 v3, v3, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -710,13 +721,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -734,18 +745,20 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v7, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; SI-NEXT: v_bfe_u32 v2, v7, 0, 16
+; SI-NEXT: v_bfe_u32 v4, v4, 0, 16
; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: v_or_b32_e32 v5, v5, v8
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB4_3
; SI-NEXT: s_branch .LBB4_4
; SI-NEXT: .LBB4_2:
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB4_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -760,55 +773,58 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_bfe_u32 v2, v2, 0, 16
+; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_or_b32_e32 v5, v4, v1
+; SI-NEXT: v_bfe_u32 v4, v6, 0, 16
; SI-NEXT: .LBB4_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
; SI-NEXT: v_mov_b32_e32 v4, 0xffff
; SI-NEXT: v_mov_b32_e32 v5, 0x8000
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, -1, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, -1, v6, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_bfe_u32 v3, v4, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
@@ -940,10 +956,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_bfe_u32 v5, v5, 0, 16
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v7
+; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: s_mov_b64 vcc, exec
@@ -993,6 +1010,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_bfe_u32 v3, v3, 0, 16
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -1211,11 +1229,11 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1237,109 +1255,119 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v11, v2
-; SI-NEXT: v_or_b32_e32 v8, v8, v12
-; SI-NEXT: v_or_b32_e32 v2, v10, v13
-; SI-NEXT: v_or_b32_e32 v9, v9, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; SI-NEXT: v_bfe_u32 v3, v7, 0, 16
+; SI-NEXT: v_bfe_u32 v4, v4, 0, 16
+; SI-NEXT: v_bfe_u32 v2, v6, 0, 16
+; SI-NEXT: v_bfe_u32 v7, v5, 0, 16
+; SI-NEXT: v_or_b32_e32 v6, v11, v12
+; SI-NEXT: v_or_b32_e32 v8, v8, v13
+; SI-NEXT: v_or_b32_e32 v5, v10, v14
+; SI-NEXT: v_or_b32_e32 v9, v9, v15
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_2:
; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB7_3: ; %T
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:18 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:20 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:22 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:24 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:26 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:28 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; SI-NEXT: v_bfe_u32 v3, v3, 0, 16
+; SI-NEXT: v_bfe_u32 v4, v4, 0, 16
+; SI-NEXT: v_bfe_u32 v2, v2, 0, 16
+; SI-NEXT: v_or_b32_e32 v6, v6, v0
; SI-NEXT: v_or_b32_e32 v8, v8, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v10
-; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: v_or_b32_e32 v5, v5, v9
+; SI-NEXT: v_or_b32_e32 v9, v7, v11
+; SI-NEXT: v_bfe_u32 v7, v10, 0, 16
; SI-NEXT: .LBB7_4: ; %exit
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: s_movk_i32 s34, 0x3800
; SI-NEXT: v_mov_b32_e32 v8, 0x3d00
; SI-NEXT: v_mov_b32_e32 v9, 0x3900
-; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000
-; SI-NEXT: v_mov_b32_e32 v11, 0x39000000
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
-; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
+; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
+; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
-; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
-; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v10, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
+; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
-; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; SI-NEXT: v_bfe_u32 v3, v2, 0, 16
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v4, v5, v12
-; SI-NEXT: v_or_b32_e32 v6, v3, v7
-; SI-NEXT: v_or_b32_e32 v2, v2, v8
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; SI-NEXT: v_or_b32_e32 v4, v7, v8
+; SI-NEXT: v_or_b32_e32 v6, v6, v9
+; SI-NEXT: v_or_b32_e32 v2, v5, v11
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16
+; SI-NEXT: v_bfe_u32 v7, v10, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
@@ -1485,13 +1513,13 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1517,31 +1545,35 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; SI-NEXT: v_bfe_u32 v11, v11, 0, 16
+; SI-NEXT: v_bfe_u32 v9, v9, 0, 16
+; SI-NEXT: v_bfe_u32 v7, v7, 0, 16
+; SI-NEXT: v_bfe_u32 v5, v5, 0, 16
+; SI-NEXT: v_or_b32_e32 v10, v10, v2
+; SI-NEXT: v_or_b32_e32 v8, v8, v3
+; SI-NEXT: v_or_b32_e32 v12, v6, v12
+; SI-NEXT: v_or_b32_e32 v13, v4, v13
; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
; SI-NEXT: v_cvt_f32_f16_e32 v3, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_or_b32_e32 v9, v10, v12
-; SI-NEXT: v_or_b32_e32 v8, v8, v13
-; SI-NEXT: v_or_b32_e32 v10, v7, v14
-; SI-NEXT: v_or_b32_e32 v11, v5, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v13
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB8_3
; SI-NEXT: s_branch .LBB8_4
; SI-NEXT: .LBB8_2:
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr2
@@ -1550,21 +1582,21 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1582,28 +1614,32 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_or_b32_e32 v0, v9, v0
-; SI-NEXT: v_or_b32_e32 v1, v8, v1
-; SI-NEXT: v_or_b32_e32 v8, v7, v10
-; SI-NEXT: v_or_b32_e32 v9, v5, v11
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; SI-NEXT: v_bfe_u32 v9, v9, 0, 16
+; SI-NEXT: v_bfe_u32 v7, v7, 0, 16
+; SI-NEXT: v_bfe_u32 v5, v5, 0, 16
+; SI-NEXT: v_bfe_u32 v12, v3, 0, 16
+; SI-NEXT: v_or_b32_e32 v0, v8, v0
+; SI-NEXT: v_or_b32_e32 v1, v6, v1
+; SI-NEXT: v_or_b32_e32 v8, v4, v10
+; SI-NEXT: v_or_b32_e32 v10, v2, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
; SI-NEXT: v_cvt_f32_f16_e32 v5, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v1
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v12
; SI-NEXT: .LBB8_4: ; %exit
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v8
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1611,9 +1647,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v10, v3
; SI-NEXT: v_cvt_f32_f16_e32 v11, v5
; SI-NEXT: v_cvt_f32_f16_e32 v12, v2
@@ -1621,11 +1657,11 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7
; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4
; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6
; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10
; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index f34824cd6cefe1..34841c81cba88c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -176,8 +176,14 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; CI-NEXT: s_and_b32 s4, s3, 0x7fff
+; CI-NEXT: s_bfe_u32 s3, s3, 0xf0010
+; CI-NEXT: s_and_b32 s5, s2, 0x7fff
+; CI-NEXT: s_bfe_u32 s2, s2, 0xf0010
+; CI-NEXT: s_lshl_b32 s3, s3, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s3, s4, s3
+; CI-NEXT: s_or_b32 s2, s5, s2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index f9694dcd89abfb..6336ad7b359902 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -29,6 +29,10 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-UNALIGNED-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align2:
@@ -225,6 +229,10 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-UNALIGNED-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align1:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index 99b163dc9753b7..68f2a633d63635 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -777,7 +777,8 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index b5440b9c38c9f2..413421f905efce 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3153,6 +3153,10 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
; SI-NEXT: s_xor_b32 s0, s0, 0x80008000
; SI-NEXT: s_cmp_eq_u32 s1, 1
; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: s_and_b32 s1, s0, 0xffff
+; SI-NEXT: s_lshr_b32 s0, s0, 16
+; SI-NEXT: s_lshl_b32 s0, s0, 16
+; SI-NEXT: s_or_b32 s0, s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 4364b32e62f8c9..2c368e045e2a01 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -303,12 +303,19 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; CI-NEXT: v_add_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v0, v0, v1
-; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
+; CI-NEXT: v_readfirstlane_b32 s3, v1
+; CI-NEXT: v_readfirstlane_b32 s2, v0
+; CI-NEXT: s_lshl_b32 s3, s3, 16
+; CI-NEXT: s_or_b32 s2, s2, s3
+; CI-NEXT: s_or_b32 s2, s2, 0x80008000
+; CI-NEXT: s_and_b32 s3, s2, 0xffff
+; CI-NEXT: s_lshr_b32 s2, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -371,6 +378,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_or_b32 s2, s2, 0x80008000
+; CI-NEXT: s_and_b32 s3, s2, 0xffff
+; CI-NEXT: s_lshr_b32 s2, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
@@ -420,18 +431,39 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
}
define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
-; CIVI-LABEL: fneg_fabs_v4f16:
-; CIVI: ; %bb.0:
-; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
-; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v3, s1
-; CIVI-NEXT: v_mov_b32_e32 v0, s2
-; CIVI-NEXT: v_mov_b32_e32 v1, s3
-; CIVI-NEXT: v_mov_b32_e32 v2, s0
-; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; CIVI-NEXT: s_endpgm
+; CI-LABEL: fneg_fabs_v4f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_or_b32 s3, s3, 0x80008000
+; CI-NEXT: s_or_b32 s2, s2, 0x80008000
+; CI-NEXT: s_and_b32 s4, s3, 0xffff
+; CI-NEXT: s_lshr_b32 s3, s3, 16
+; CI-NEXT: s_and_b32 s5, s2, 0xffff
+; CI-NEXT: s_lshr_b32 s2, s2, 16
+; CI-NEXT: s_lshl_b32 s3, s3, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s3, s4, s3
+; CI-NEXT: s_or_b32 s2, s5, s2
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_or_b32 s3, s3, 0x80008000
+; VI-NEXT: s_or_b32 s2, s2, 0x80008000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fneg_fabs_v4f16:
; GFX9: ; %bb.0:
@@ -469,8 +501,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s0, 16
-; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
+; CI-NEXT: v_lshrrev_b32_e64 v1, 16, s0
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1
@@ -543,8 +575,12 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_or_b32 s1, s0, 0x80008000
-; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_and_b32 s2, s1, 0xffff
+; CI-NEXT: s_lshr_b32 s1, s1, 16
+; CI-NEXT: s_lshl_b32 s1, s1, 16
+; CI-NEXT: s_or_b32 s1, s2, s1
+; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: v_mov_b32_e32 v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index cd1ec85eb8d0f3..f8d9e496888904 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -182,12 +182,12 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fneg_xor_select_v2i16:
@@ -756,7 +756,7 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: select_fneg_xor_select_v2i16:
@@ -908,10 +908,11 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 31, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 31c1389c940208..052faaa55846e4 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -246,6 +246,10 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
+; CI-NEXT: s_and_b32 s3, s2, 0xffff
+; CI-NEXT: s_lshr_b32 s2, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
@@ -294,19 +298,37 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
}
define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
-; CIVI-LABEL: s_fneg_v2f16_nonload:
-; CIVI: ; %bb.0:
-; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CIVI-NEXT: ;;#ASMSTART
-; CIVI-NEXT: ; def s2
-; CIVI-NEXT: ;;#ASMEND
-; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
-; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: flat_store_dword v[0:1], v2
-; CIVI-NEXT: s_endpgm
+; CI-LABEL: s_fneg_v2f16_nonload:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: ;;#ASMSTART
+; CI-NEXT: ; def s2
+; CI-NEXT: ;;#ASMEND
+; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
+; CI-NEXT: s_and_b32 s3, s2, 0xffff
+; CI-NEXT: s_lshr_b32 s2, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s2, s3, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_fneg_v2f16_nonload:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; def s2
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_fneg_v2f16_nonload:
; GFX9: ; %bb.0:
@@ -354,6 +376,10 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT: flat_load_dword v2, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -410,6 +436,10 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
+; CI-NEXT: s_and_b32 s3, s2, 0xffff
+; CI-NEXT: s_lshr_b32 s2, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s2, 16
+; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index c7677942719de1..e48185f2fb05dc 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -811,12 +811,12 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
; SI-LABEL: v_fneg_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 7c5f6d5e33efe7..cc28e72c648442 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -2532,16 +2532,17 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 31, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
-; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbd800000, v0
+; VI-NEXT: v_sub_u32_e32 v0, vcc, -0.5, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 31, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
+; GFX10-NEXT: v_add_lshl_u32 v0, v0, 3, 23
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, -0.5, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt:
@@ -2549,8 +2550,8 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
+; GFX11-NEXT: v_add_lshl_u32 v0, v0, 3, 23
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, -0.5, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
@@ -3024,29 +3025,32 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 20, v0
-; VI-NEXT: v_mov_b32_e32 v1, 0x36a00000
-; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0
-; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v0, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v1, 20, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0x36a00000
+; VI-NEXT: v_sub_u32_e64 v0, vcc, 0, 0
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; VI-NEXT: v_lshrrev_b64 v[1:2], 32, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 20, v0
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v1, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], 32, v[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0
-; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 20, v0
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v1, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], 32, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i32 1, %cnt
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 64063f65e288ff..aa558e4a93d812 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -244,9 +244,9 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) {
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T0.W, KC0[2].Z, literal.x, PV.W,
+; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.W, PS, literal.x,
@@ -364,23 +364,24 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
;
; EG-LABEL: fp_to_sint_v2i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 76, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W,
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].X, literal.x,
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z,
-; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T0.X, literal.x, PV.W,
-; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
+; EG-NEXT: ADD_INT T2.W, PV.W, literal.y,
+; EG-NEXT: LSHR * T1.W, T1.W, literal.z,
+; EG-NEXT: 8388607(1.175494e-38), -150(nan)
+; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT: SUB_INT T0.X, literal.x, PS,
+; EG-NEXT: SUB_INT T0.Y, literal.x, T0.W,
+; EG-NEXT: AND_INT T1.Z, PV.W, literal.y,
; EG-NEXT: OR_INT T3.W, PV.Z, literal.z,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44)
@@ -393,7 +394,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x,
+; EG-NEXT: ADD_INT T1.Z, T1.W, literal.x,
; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X,
; EG-NEXT: AND_INT * T5.W, T0.X, literal.y,
; EG-NEXT: -150(nan), 32(4.484155e-44)
@@ -404,39 +405,39 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
; EG-NEXT: LSHR * T4.W, T1.X, 1,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T3.X, T3.W, 1,
-; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T3.Y, T1.W, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T1.X, PV.Z,
+; EG-NEXT: LSHL T1.W, T1.X, PV.Z,
; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y,
-; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T2.Y,
+; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.X, PV.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
; EG-NEXT: XOR_INT T2.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
-; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.X, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y,
-; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W,
+; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W,
+; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
@@ -567,167 +568,170 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
;
; EG-LABEL: fp_to_sint_v4i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1
+; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 59, @106, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y,
+; EG-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[4].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT: OR_INT T0.Z, PS, literal.x,
-; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z,
-; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.Z, 1,
-; EG-NEXT: -127(nan), 31(4.344025e-44)
-; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201
-; EG-NEXT: LSHL T3.W, T0.Z, PV.Z,
-; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W,
-; EG-NEXT: -127(nan), 32(4.484155e-44)
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS,
-; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W,
-; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z,
-; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W,
-; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x,
-; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0,
+; EG-NEXT: OR_INT T1.W, PS, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS,
-; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x,
-; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T1.W, PV.Z,
-; EG-NEXT: LSHR * T3.W, PV.Y, 1,
+; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
+; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W,
+; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T0.W, PS,
+; EG-NEXT: LSHR * T3.W, PV.W, 1,
+; EG-NEXT: -127(nan), 150(2.101948e-43)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z,
-; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y,
-; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y,
-; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W,
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, T0.Z, literal.x,
-; EG-NEXT: AND_INT T4.Y, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y,
-; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y,
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT: LSHL T1.Y, T1.W, PV.Z,
+; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T2.X, PV.W, PS,
-; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0,
-; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0,
-; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122
-; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
-; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y,
-; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z,
-; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z,
+; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
+; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y,
+; EG-NEXT: SETGT_INT T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y,
+; EG-NEXT: 23(3.222986e-44), 2139095040(INF)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: AND_INT T1.Y, KC0[3].Z, literal.y,
+; EG-NEXT: CNDE_INT T0.Z, PV.W, 0.0, PV.Z,
+; EG-NEXT: CNDE_INT T0.W, PV.W, PV.X, PV.Y,
+; EG-NEXT: ASHR * T1.W, KC0[4].X, literal.z,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
+; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: OR_INT T0.W, PV.Y, literal.y,
-; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X,
+; EG-NEXT: OR_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.X, literal.y,
+; EG-NEXT: 8388608(1.175494e-38), -150(nan)
+; EG-NEXT: NOT_INT T1.Y, PS,
+; EG-NEXT: LSHR T1.Z, PV.W, 1,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T1.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T1.W,
+; EG-NEXT: SUB_INT T2.X, PV.W, PS,
+; EG-NEXT: ADD_INT T2.Y, T1.X, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PV.Z, PV.Y,
+; EG-NEXT: SUB_INT T3.W, literal.y, T1.X,
+; EG-NEXT: AND_INT * T4.W, T2.W, literal.z,
+; EG-NEXT: -127(nan), 150(2.101948e-43)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.X, T0.W, PS,
+; EG-NEXT: AND_INT T1.Y, T2.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T1.Z, PV.W, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T0.W, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T2.W, KC0[3].W, literal.y,
+; EG-NEXT: 32(4.484155e-44), 2139095040(INF)
+; EG-NEXT: LSHR T3.X, PS, literal.x,
+; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, 0.0,
+; EG-NEXT: CNDE_INT T0.W, PV.Y, T0.Z, PV.X,
+; EG-NEXT: SETGT_INT * T2.W, T2.Y, literal.x,
+; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
+; EG-NEXT: CNDE_INT T1.Y, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T0.Z, PS, PV.Y, PV.Z,
+; EG-NEXT: ASHR T0.W, KC0[3].Z, literal.y,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Y, literal.z,
+; EG-NEXT: 8388607(1.175494e-38), 31(4.344025e-44)
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.X, PS, literal.x,
+; EG-NEXT: XOR_INT T3.Y, PV.Z, PV.W,
+; EG-NEXT: XOR_INT T0.Z, PV.Y, PV.W,
+; EG-NEXT: OR_INT T2.W, PV.X, literal.y,
+; EG-NEXT: SUB_INT * T3.W, literal.z, T3.X,
; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT T3.Y, PS, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS,
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W,
-; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W,
+; EG-NEXT: AND_INT T1.X, KC0[3].Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, PS, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T1.Z, 0.0, PV.W, PS,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T0.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T0.W,
; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44)
; EG-NEXT: SUB_INT T5.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y,
+; EG-NEXT: SETGT_INT T2.Y, 0.0, T2.Y,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
-; EG-NEXT: OR_INT T1.W, PV.X, literal.x,
-; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y,
+; EG-NEXT: OR_INT T3.W, PV.X, literal.x,
+; EG-NEXT: ADD_INT * T4.W, T4.X, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: ADD_INT T4.X, T3.X, literal.x,
-; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X,
-; EG-NEXT: AND_INT T2.Z, PS, literal.z,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.W, 1,
+; EG-NEXT: ADD_INT T1.X, T4.X, literal.x,
+; EG-NEXT: SUB_INT T1.Y, literal.y, T4.X,
+; EG-NEXT: AND_INT T1.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T5.W, PS,
+; EG-NEXT: LSHR * T6.W, PV.W, 1,
; EG-NEXT: -127(nan), 150(2.101948e-43)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T4.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T4.X, 0.0, PS, PV.W,
+; EG-NEXT: LSHL T4.Y, T3.W, PV.Z,
+; EG-NEXT: AND_INT T1.Z, T4.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT * T3.W, 0.0, T3.W, PV.Y, BS:VEC_021/SCL_122
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T6.X, T1.X, literal.x,
-; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 108:
-; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y,
-; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z,
-; EG-NEXT: AND_INT T2.Z, T6.X, literal.x,
-; EG-NEXT: NOT_INT T1.W, T6.X,
-; EG-NEXT: LSHR * T3.W, T0.W, 1,
+; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: AND_INT * T4.W, T1.Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T6.X, T3.X, literal.x,
+; EG-NEXT: CNDE_INT T1.Y, PV.W, T3.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, T1.Z, T4.Y, 0.0,
+; EG-NEXT: CNDE_INT T3.W, T1.Z, T4.X, T4.Y,
+; EG-NEXT: SETGT_INT * T4.W, T1.X, literal.y,
+; EG-NEXT: -150(nan), 23(3.222986e-44)
+; EG-NEXT: CNDE_INT T4.X, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T1.Y, PS, PV.Y, PV.Z,
+; EG-NEXT: AND_INT T1.Z, PV.X, literal.x,
+; EG-NEXT: NOT_INT T3.W, PV.X,
+; EG-NEXT: LSHR * T4.W, T2.W, 1,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, PV.Z,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
+; EG-NEXT: ADD_INT T4.Y, T3.X, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: LSHL T2.W, T2.W, PV.Z,
+; EG-NEXT: AND_INT * T3.W, T6.X, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x,
-; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X,
-; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: XOR_INT T2.W, T1.Y, PV.X,
+; EG-NEXT: XOR_INT * T3.W, T4.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T3.X, PS, T7.X,
-; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X,
-; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X,
-; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x,
+; EG-NEXT: SUB_INT T4.X, PS, T7.X,
+; EG-NEXT: SUBB_UINT T1.Y, PV.W, T7.X,
+; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T3.W, PV.Z, T0.Z, PV.X,
+; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: XOR_INT T1.X, PV.W, PS,
+; EG-NEXT: XOR_INT T3.X, PV.W, PS,
; EG-NEXT: XOR_INT T5.Y, PV.Z, PS,
; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y,
-; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0,
+; EG-NEXT: SETGT_INT T3.W, 0.0, T1.X, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT * T6.W, T2.Y, T5.X, 0.0,
; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W,
-; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W,
-; EG-NEXT: SUB_INT T3.X, PV.W, PS,
+; EG-NEXT: SUB_INT T0.Z, T3.Y, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT: SUB_INT T0.W, PV.Y, T4.W,
+; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
+; EG-NEXT: SUB_INT T1.X, PV.W, PS,
; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y,
-; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0,
-; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: CNDE_INT T6.Z, T2.Y, PV.Z, 0.0,
+; EG-NEXT: SUB_INT T0.W, T2.W, T7.X, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT * T2.W, PV.X, T2.X, 0.0,
+; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, 0.0,
+; EG-NEXT: SUB_INT T0.W, T0.Y, T1.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0,
-; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, T0.X, PV.W, 0.0,
+; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T2.X, T1.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.X, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 5170f9c76db23d..923d3055140f46 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -191,9 +191,9 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T0.W, KC0[2].Z, literal.x, PV.W,
+; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.W, PS, literal.x,
@@ -288,23 +288,24 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 76, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W,
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].X, literal.x,
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z,
-; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T0.X, literal.x, PV.W,
-; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
+; EG-NEXT: ADD_INT T2.W, PV.W, literal.y,
+; EG-NEXT: LSHR * T1.W, T1.W, literal.z,
+; EG-NEXT: 8388607(1.175494e-38), -150(nan)
+; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT: SUB_INT T0.X, literal.x, PS,
+; EG-NEXT: SUB_INT T0.Y, literal.x, T0.W,
+; EG-NEXT: AND_INT T1.Z, PV.W, literal.y,
; EG-NEXT: OR_INT T3.W, PV.Z, literal.z,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44)
@@ -317,7 +318,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x,
+; EG-NEXT: ADD_INT T1.Z, T1.W, literal.x,
; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X,
; EG-NEXT: AND_INT * T5.W, T0.X, literal.y,
; EG-NEXT: -150(nan), 32(4.484155e-44)
@@ -328,39 +329,39 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
; EG-NEXT: LSHR * T4.W, T1.X, 1,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T3.X, T3.W, 1,
-; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T3.Y, T1.W, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T1.X, PV.Z,
+; EG-NEXT: LSHL T1.W, T1.X, PV.Z,
; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y,
-; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T2.Y,
+; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.X, PV.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
; EG-NEXT: XOR_INT T2.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
-; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.X, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y,
-; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W,
+; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W,
+; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
@@ -449,167 +450,170 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1
+; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 59, @106, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y,
+; EG-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[4].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT: OR_INT T0.Z, PS, literal.x,
-; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z,
-; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.Z, 1,
-; EG-NEXT: -127(nan), 31(4.344025e-44)
-; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201
-; EG-NEXT: LSHL T3.W, T0.Z, PV.Z,
-; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W,
-; EG-NEXT: -127(nan), 32(4.484155e-44)
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS,
-; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W,
-; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z,
-; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W,
-; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x,
-; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0,
+; EG-NEXT: OR_INT T1.W, PS, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS,
-; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x,
-; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T1.W, PV.Z,
-; EG-NEXT: LSHR * T3.W, PV.Y, 1,
+; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
+; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W,
+; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T0.W, PS,
+; EG-NEXT: LSHR * T3.W, PV.W, 1,
+; EG-NEXT: -127(nan), 150(2.101948e-43)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z,
-; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y,
-; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y,
-; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W,
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, T0.Z, literal.x,
-; EG-NEXT: AND_INT T4.Y, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y,
-; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y,
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT: LSHL T1.Y, T1.W, PV.Z,
+; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T2.X, PV.W, PS,
-; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0,
-; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0,
-; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122
-; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
-; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y,
-; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z,
-; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z,
+; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
+; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y,
+; EG-NEXT: SETGT_INT T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y,
+; EG-NEXT: 23(3.222986e-44), 2139095040(INF)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: AND_INT T1.Y, KC0[3].Z, literal.y,
+; EG-NEXT: CNDE_INT T0.Z, PV.W, 0.0, PV.Z,
+; EG-NEXT: CNDE_INT T0.W, PV.W, PV.X, PV.Y,
+; EG-NEXT: ASHR * T1.W, KC0[4].X, literal.z,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
+; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: OR_INT T0.W, PV.Y, literal.y,
-; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X,
+; EG-NEXT: OR_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.X, literal.y,
+; EG-NEXT: 8388608(1.175494e-38), -150(nan)
+; EG-NEXT: NOT_INT T1.Y, PS,
+; EG-NEXT: LSHR T1.Z, PV.W, 1,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T1.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T1.W,
+; EG-NEXT: SUB_INT T2.X, PV.W, PS,
+; EG-NEXT: ADD_INT T2.Y, T1.X, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PV.Z, PV.Y,
+; EG-NEXT: SUB_INT T3.W, literal.y, T1.X,
+; EG-NEXT: AND_INT * T4.W, T2.W, literal.z,
+; EG-NEXT: -127(nan), 150(2.101948e-43)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.X, T0.W, PS,
+; EG-NEXT: AND_INT T1.Y, T2.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T1.Z, PV.W, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T0.W, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T2.W, KC0[3].W, literal.y,
+; EG-NEXT: 32(4.484155e-44), 2139095040(INF)
+; EG-NEXT: LSHR T3.X, PS, literal.x,
+; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, 0.0,
+; EG-NEXT: CNDE_INT T0.W, PV.Y, T0.Z, PV.X,
+; EG-NEXT: SETGT_INT * T2.W, T2.Y, literal.x,
+; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
+; EG-NEXT: CNDE_INT T1.Y, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T0.Z, PS, PV.Y, PV.Z,
+; EG-NEXT: ASHR T0.W, KC0[3].Z, literal.y,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Y, literal.z,
+; EG-NEXT: 8388607(1.175494e-38), 31(4.344025e-44)
+; EG-NEXT: 2139095040(INF), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.X, PS, literal.x,
+; EG-NEXT: XOR_INT T3.Y, PV.Z, PV.W,
+; EG-NEXT: XOR_INT T0.Z, PV.Y, PV.W,
+; EG-NEXT: OR_INT T2.W, PV.X, literal.y,
+; EG-NEXT: SUB_INT * T3.W, literal.z, T3.X,
; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT T3.Y, PS, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS,
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W,
-; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W,
+; EG-NEXT: AND_INT T1.X, KC0[3].Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, PS, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T1.Z, 0.0, PV.W, PS,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T0.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T0.W,
; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44)
; EG-NEXT: SUB_INT T5.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y,
+; EG-NEXT: SETGT_INT T2.Y, 0.0, T2.Y,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
-; EG-NEXT: OR_INT T1.W, PV.X, literal.x,
-; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y,
+; EG-NEXT: OR_INT T3.W, PV.X, literal.x,
+; EG-NEXT: ADD_INT * T4.W, T4.X, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: ADD_INT T4.X, T3.X, literal.x,
-; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X,
-; EG-NEXT: AND_INT T2.Z, PS, literal.z,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.W, 1,
+; EG-NEXT: ADD_INT T1.X, T4.X, literal.x,
+; EG-NEXT: SUB_INT T1.Y, literal.y, T4.X,
+; EG-NEXT: AND_INT T1.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T5.W, PS,
+; EG-NEXT: LSHR * T6.W, PV.W, 1,
; EG-NEXT: -127(nan), 150(2.101948e-43)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T4.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T4.X, 0.0, PS, PV.W,
+; EG-NEXT: LSHL T4.Y, T3.W, PV.Z,
+; EG-NEXT: AND_INT T1.Z, T4.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT * T3.W, 0.0, T3.W, PV.Y, BS:VEC_021/SCL_122
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T6.X, T1.X, literal.x,
-; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 108:
-; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y,
-; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z,
-; EG-NEXT: AND_INT T2.Z, T6.X, literal.x,
-; EG-NEXT: NOT_INT T1.W, T6.X,
-; EG-NEXT: LSHR * T3.W, T0.W, 1,
+; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: AND_INT * T4.W, T1.Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T6.X, T3.X, literal.x,
+; EG-NEXT: CNDE_INT T1.Y, PV.W, T3.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, T1.Z, T4.Y, 0.0,
+; EG-NEXT: CNDE_INT T3.W, T1.Z, T4.X, T4.Y,
+; EG-NEXT: SETGT_INT * T4.W, T1.X, literal.y,
+; EG-NEXT: -150(nan), 23(3.222986e-44)
+; EG-NEXT: CNDE_INT T4.X, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T1.Y, PS, PV.Y, PV.Z,
+; EG-NEXT: AND_INT T1.Z, PV.X, literal.x,
+; EG-NEXT: NOT_INT T3.W, PV.X,
+; EG-NEXT: LSHR * T4.W, T2.W, 1,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, PV.Z,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
+; EG-NEXT: ADD_INT T4.Y, T3.X, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: LSHL T2.W, T2.W, PV.Z,
+; EG-NEXT: AND_INT * T3.W, T6.X, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x,
-; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X,
-; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: XOR_INT T2.W, T1.Y, PV.X,
+; EG-NEXT: XOR_INT * T3.W, T4.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T3.X, PS, T7.X,
-; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X,
-; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X,
-; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x,
+; EG-NEXT: SUB_INT T4.X, PS, T7.X,
+; EG-NEXT: SUBB_UINT T1.Y, PV.W, T7.X,
+; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T3.W, PV.Z, T0.Z, PV.X,
+; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: XOR_INT T1.X, PV.W, PS,
+; EG-NEXT: XOR_INT T3.X, PV.W, PS,
; EG-NEXT: XOR_INT T5.Y, PV.Z, PS,
; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y,
-; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0,
+; EG-NEXT: SETGT_INT T3.W, 0.0, T1.X, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT * T6.W, T2.Y, T5.X, 0.0,
; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W,
-; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W,
-; EG-NEXT: SUB_INT T3.X, PV.W, PS,
+; EG-NEXT: SUB_INT T0.Z, T3.Y, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT: SUB_INT T0.W, PV.Y, T4.W,
+; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
+; EG-NEXT: SUB_INT T1.X, PV.W, PS,
; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y,
-; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0,
-; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: CNDE_INT T6.Z, T2.Y, PV.Z, 0.0,
+; EG-NEXT: SUB_INT T0.W, T2.W, T7.X, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT * T2.W, PV.X, T2.X, 0.0,
+; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, 0.0,
+; EG-NEXT: SUB_INT T0.W, T0.Y, T1.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0,
-; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, T0.X, PV.W, 0.0,
+; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T2.X, T1.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.X, PV.W, literal.x,
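
[Note on the EG diffs above: every hunk makes the same substitution. The old
BFE_UINT (extract 8 bits starting at bit 23) becomes an AND_INT with the
IEEE-754 single-precision exponent mask 0x7F800000 followed by an LSHR by 23,
which yields the identical 8-bit exponent field while dropping the MOV that
materialized the width constant. A minimal IR sketch of the pattern being
lowered -- a hypothetical reduction for illustration, not one of the tests in
this patch:

; Extract the float exponent field; equivalent to BFE_UINT %bits, 23, 8.
define i32 @exp_via_mask_shift(float %x) {
  %bits = bitcast float %x to i32
  %masked = and i32 %bits, 2139095040   ; 0x7F800000, exponent mask
  %exp = lshr i32 %masked, 23           ; exact: no set bits are shifted out
  ret i32 %exp
}
]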
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
index d824763c22e27a..a8580f77e4b212 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
@@ -14,15 +14,17 @@ define void @scalar(double %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CHECK-NEXT: s_brev_b32 s4, 1
-; CHECK-NEXT: v_and_or_b32 v5, v1, s4, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc
+; CHECK-NEXT: v_and_b32_e32 v5, 0x80000000, v1
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[4:5]
+; CHECK-NEXT: v_or_b32_e32 v4, v6, v4
+; CHECK-NEXT: v_bfe_u32 v5, v6, 16, 1
; CHECK-NEXT: s_movk_i32 s4, 0x7fff
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s4
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; CHECK-NEXT: global_store_short_d16_hi v[2:3], v0, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
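
[Note on this hunk and the GCN hunks below: the old single v_and_or_b32 and the
new v_and_b32 + v_lshrrev_b64 + v_or_b32 sequence compute the same value. With
the low lane zeroed by v_mov_b32, the 64-bit right shift of v[4:5] by 32 just
moves the masked sign word back into the low lane before the OR, so both forms
OR the sign bit of the double's high word into the rounded bf16 intermediate.
A hedged IR sketch of that value (hypothetical name, for illustration only):

; OR the sign bit of the high dword %hi into the intermediate %v.
define i32 @or_sign(i32 %hi, i32 %v) {
  %sign = and i32 %hi, -2147483648   ; 0x80000000, sign bit of the high dword
  %r = or i32 %v, %sign
  ret i32 %r
}
]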
@@ -45,33 +47,37 @@ define void @v2(<2 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v6, v8, v6
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; CHECK-NEXT: s_brev_b32 s8, 1
-; CHECK-NEXT: v_and_or_b32 v7, v1, s8, v6
-; CHECK-NEXT: v_bfe_u32 v6, v6, 16, 1
-; CHECK-NEXT: s_movk_i32 s9, 0x7fff
-; CHECK-NEXT: v_add3_u32 v6, v6, v7, s9
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v6, v8, vcc
+; CHECK-NEXT: v_and_b32_e32 v7, 0x80000000, v1
+; CHECK-NEXT: v_mov_b32_e32 v6, 0
+; CHECK-NEXT: v_lshrrev_b64 v[8:9], 32, v[6:7]
+; CHECK-NEXT: v_or_b32_e32 v7, v10, v8
+; CHECK-NEXT: v_bfe_u32 v8, v10, 16, 1
+; CHECK-NEXT: s_movk_i32 s8, 0x7fff
+; CHECK-NEXT: v_add3_u32 v8, v8, v7, s8
; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v7
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v7, |v[2:3]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
-; CHECK-NEXT: v_and_b32_e32 v8, 1, v7
+; CHECK-NEXT: v_and_b32_e32 v9, 1, v7
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v0, v7, v0
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v0, v7, vcc
+; CHECK-NEXT: v_and_b32_e32 v7, 0x80000000, v3
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[6:7]
+; CHECK-NEXT: v_or_b32_e32 v0, v9, v0
+; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CHECK-NEXT: s_mov_b32 s4, 0x7060302
-; CHECK-NEXT: v_perm_b32 v0, v0, v6, s4
+; CHECK-NEXT: v_perm_b32 v0, v0, v8, s4
; CHECK-NEXT: global_store_dword v[4:5], v0, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -94,49 +100,55 @@ define void @v3(<3 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
-; CHECK-NEXT: s_brev_b32 s8, 1
-; CHECK-NEXT: v_and_or_b32 v9, v1, s8, v8
-; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
-; CHECK-NEXT: s_movk_i32 s9, 0x7fff
-; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v8, v10, vcc
+; CHECK-NEXT: v_and_b32_e32 v9, 0x80000000, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: v_lshrrev_b64 v[10:11], 32, v[8:9]
+; CHECK-NEXT: v_or_b32_e32 v9, v12, v10
+; CHECK-NEXT: v_bfe_u32 v10, v12, 16, 1
+; CHECK-NEXT: s_movk_i32 s8, 0x7fff
+; CHECK-NEXT: v_add3_u32 v10, v10, v9, s8
; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[2:3]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
-; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v9
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v0, v9, v0
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v11, v0, v9, vcc
+; CHECK-NEXT: v_and_b32_e32 v9, 0x80000000, v3
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[8:9]
+; CHECK-NEXT: v_or_b32_e32 v0, v11, v0
+; CHECK-NEXT: v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CHECK-NEXT: s_mov_b32 s4, 0x7060302
; CHECK-NEXT: v_cvt_f32_f64_e64 v3, |v[4:5]|
-; CHECK-NEXT: v_perm_b32 v2, v0, v8, s4
+; CHECK-NEXT: v_perm_b32 v2, v0, v10, s4
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; CHECK-NEXT: v_and_b32_e32 v8, 1, v3
+; CHECK-NEXT: v_and_b32_e32 v9, 1, v3
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[0:1]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[0:1]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v0, v3, v0
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v5, s8, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_and_b32_e32 v9, 0x80000000, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[8:9]
+; CHECK-NEXT: v_or_b32_e32 v0, v3, v0
+; CHECK-NEXT: v_bfe_u32 v1, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CHECK-NEXT: global_store_short_d16_hi v[6:7], v0, off offset:4
; CHECK-NEXT: global_store_dword v[6:7], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -160,46 +172,52 @@ define void @v4(<4 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v10, v12, v10
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc
-; CHECK-NEXT: s_brev_b32 s8, 1
-; CHECK-NEXT: v_and_or_b32 v11, v5, s8, v10
-; CHECK-NEXT: v_bfe_u32 v10, v10, 16, 1
-; CHECK-NEXT: s_movk_i32 s9, 0x7fff
-; CHECK-NEXT: v_add3_u32 v10, v10, v11, s9
+; CHECK-NEXT: v_cndmask_b32_e32 v14, v10, v12, vcc
+; CHECK-NEXT: v_and_b32_e32 v11, 0x80000000, v5
+; CHECK-NEXT: v_mov_b32_e32 v10, 0
+; CHECK-NEXT: v_lshrrev_b64 v[12:13], 32, v[10:11]
+; CHECK-NEXT: v_or_b32_e32 v11, v14, v12
+; CHECK-NEXT: v_bfe_u32 v12, v14, 16, 1
+; CHECK-NEXT: s_movk_i32 s8, 0x7fff
+; CHECK-NEXT: v_add3_u32 v12, v12, v11, s8
; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v11
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v11, |v[6:7]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v11
-; CHECK-NEXT: v_and_b32_e32 v12, 1, v11
+; CHECK-NEXT: v_and_b32_e32 v13, 1, v11
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v4, v11, v4
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
-; CHECK-NEXT: v_and_or_b32 v5, v7, s8, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v4, v11, vcc
+; CHECK-NEXT: v_and_b32_e32 v11, 0x80000000, v7
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[10:11]
+; CHECK-NEXT: v_or_b32_e32 v4, v13, v4
+; CHECK-NEXT: v_bfe_u32 v5, v13, 16, 1
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s8
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: s_mov_b32 s10, 0x7060302
-; CHECK-NEXT: v_perm_b32 v5, v4, v10, s10
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; CHECK-NEXT: s_mov_b32 s9, 0x7060302
+; CHECK-NEXT: v_perm_b32 v5, v4, v12, s9
; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[0:1]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
-; CHECK-NEXT: v_and_b32_e32 v10, 1, v4
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v4
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v6, v4, v6
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_and_b32_e32 v11, 0x80000000, v1
; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; CHECK-NEXT: v_and_or_b32 v6, v1, s8, v4
+; CHECK-NEXT: v_lshrrev_b64 v[6:7], 32, v[10:11]
+; CHECK-NEXT: v_or_b32_e32 v6, v4, v6
; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v6, s9
+; CHECK-NEXT: v_add3_u32 v4, v4, v6, s8
; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v6
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
@@ -212,14 +230,16 @@ define void @v4(<4 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v0, v6, v0
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_and_b32_e32 v11, 0x80000000, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[10:11]
+; CHECK-NEXT: v_or_b32_e32 v0, v6, v0
+; CHECK-NEXT: v_bfe_u32 v1, v6, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_perm_b32 v4, v0, v4, s10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT: v_perm_b32 v4, v0, v4, s9
; CHECK-NEXT: global_store_dwordx2 v[8:9], v[4:5], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -242,46 +262,52 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v18, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v18, v20, v18
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v20, vcc
-; CHECK-NEXT: s_brev_b32 s8, 1
-; CHECK-NEXT: v_and_or_b32 v19, v13, s8, v18
-; CHECK-NEXT: v_bfe_u32 v18, v18, 16, 1
-; CHECK-NEXT: s_movk_i32 s9, 0x7fff
-; CHECK-NEXT: v_add3_u32 v18, v18, v19, s9
+; CHECK-NEXT: v_cndmask_b32_e32 v22, v18, v20, vcc
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v13
+; CHECK-NEXT: v_mov_b32_e32 v18, 0
+; CHECK-NEXT: v_lshrrev_b64 v[20:21], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v19, v22, v20
+; CHECK-NEXT: v_bfe_u32 v20, v22, 16, 1
+; CHECK-NEXT: s_movk_i32 s8, 0x7fff
+; CHECK-NEXT: v_add3_u32 v20, v20, v19, s8
; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v19
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
-; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v19, |v[14:15]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v19
-; CHECK-NEXT: v_and_b32_e32 v20, 1, v19
+; CHECK-NEXT: v_and_b32_e32 v21, 1, v19
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[14:15]|, v[12:13]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[14:15]|, v[12:13]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v12, v19, v12
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v19, vcc
-; CHECK-NEXT: v_and_or_b32 v13, v15, s8, v12
-; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
-; CHECK-NEXT: v_add3_u32 v12, v12, v13, s9
-; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT: v_cndmask_b32_e32 v21, v12, v19, vcc
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v15
+; CHECK-NEXT: v_lshrrev_b64 v[12:13], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v12, v21, v12
+; CHECK-NEXT: v_bfe_u32 v13, v21, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v12, s8
+; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v12
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
-; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
-; CHECK-NEXT: s_mov_b32 s10, 0x7060302
-; CHECK-NEXT: v_perm_b32 v13, v12, v18, s10
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; CHECK-NEXT: s_mov_b32 s9, 0x7060302
+; CHECK-NEXT: v_perm_b32 v13, v12, v20, s9
; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[8:9]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
-; CHECK-NEXT: v_and_b32_e32 v18, 1, v12
+; CHECK-NEXT: v_and_b32_e32 v19, 1, v12
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[8:9]|, v[14:15]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[8:9]|, v[14:15]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v14, v12, v14
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v9
; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
-; CHECK-NEXT: v_and_or_b32 v14, v9, s8, v12
+; CHECK-NEXT: v_lshrrev_b64 v[14:15], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v14, v12, v14
; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
-; CHECK-NEXT: v_add3_u32 v12, v12, v14, s9
+; CHECK-NEXT: v_add3_u32 v12, v12, v14, s8
; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v14
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
@@ -294,15 +320,17 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v8, v14, v8
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
-; CHECK-NEXT: v_and_or_b32 v9, v11, s8, v8
-; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
-; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
-; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v11
+; CHECK-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[8:9], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v8, v14, v8
+; CHECK-NEXT: v_bfe_u32 v9, v14, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v8, s8
+; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[4:5]|
-; CHECK-NEXT: v_perm_b32 v12, v8, v12, s10
+; CHECK-NEXT: v_perm_b32 v12, v8, v12, s9
; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[8:9]
@@ -311,13 +339,15 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
-; CHECK-NEXT: v_and_or_b32 v9, v5, s8, v8
-; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
-; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
-; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[8:9], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v8, v10, v8
+; CHECK-NEXT: v_bfe_u32 v9, v10, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v8, s8
+; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[6:7]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9
; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
@@ -327,15 +357,17 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v4, v9, v4
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; CHECK-NEXT: v_and_or_b32 v5, v7, s8, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v4, v9, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v4, v9, v4
+; CHECK-NEXT: v_bfe_u32 v5, v9, 16, 1
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s8
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; CHECK-NEXT: v_perm_b32 v11, v4, v8, s10
+; CHECK-NEXT: v_perm_b32 v11, v4, v8, s9
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
@@ -344,13 +376,15 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CHECK-NEXT: v_and_or_b32 v5, v1, s8, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v4, v6, v4
+; CHECK-NEXT: v_bfe_u32 v5, v6, 16, 1
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s8
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
@@ -360,14 +394,16 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_and_b32_e32 v19, 0x80000000, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[18:19]
+; CHECK-NEXT: v_or_b32_e32 v0, v5, v0
+; CHECK-NEXT: v_bfe_u32 v1, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_perm_b32 v10, v0, v4, s10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT: v_perm_b32 v10, v0, v4, s9
; CHECK-NEXT: global_store_dwordx4 v[16:17], v[10:13], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -393,265 +429,297 @@ define void @v16(<16 x double> %num, ptr addrspace(1) %p) {
; CHECK-NEXT: v_cndmask_b32_e64 v34, -1, 1, s[6:7]
; CHECK-NEXT: v_add_u32_e32 v34, v36, v34
; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v34, v34, v36, vcc
-; CHECK-NEXT: s_brev_b32 s4, 1
-; CHECK-NEXT: v_and_or_b32 v35, v13, s4, v34
-; CHECK-NEXT: v_bfe_u32 v34, v34, 16, 1
-; CHECK-NEXT: s_movk_i32 s5, 0x7fff
-; CHECK-NEXT: v_add3_u32 v34, v34, v35, s5
+; CHECK-NEXT: v_cndmask_b32_e32 v38, v34, v36, vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v13
+; CHECK-NEXT: v_mov_b32_e32 v34, 0
+; CHECK-NEXT: v_lshrrev_b64 v[36:37], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v35, v38, v36
+; CHECK-NEXT: v_bfe_u32 v36, v38, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v36, v36, v35, s4
; CHECK-NEXT: v_or_b32_e32 v35, 0x400000, v35
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
-; CHECK-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v35, |v[14:15]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v35
-; CHECK-NEXT: v_and_b32_e32 v36, 1, v35
+; CHECK-NEXT: v_and_b32_e32 v37, 1, v35
; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[14:15]|, v[12:13]
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[14:15]|, v[12:13]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v37
; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v12, v35, v12
; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v35, vcc
-; CHECK-NEXT: v_and_or_b32 v13, v15, s4, v12
-; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
-; CHECK-NEXT: v_add3_u32 v12, v12, v13, s5
-; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT: v_cndmask_b32_e32 v37, v12, v35, vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v15
+; CHECK-NEXT: v_lshrrev_b64 v[12:13], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v12, v37, v12
+; CHECK-NEXT: v_bfe_u32 v13, v37, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v12, s4
+; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v12
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
-; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
-; CHECK-NEXT: s_mov_b32 s6, 0x7060302
-; CHECK-NEXT: v_perm_b32 v13, v12, v34, s6
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v13, v12, v36, s5
; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[8:9]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
-; CHECK-NEXT: v_and_b32_e32 v34, 1, v12
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[8:9]|, v[14:15]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[8:9]|, v[14:15]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v34
-; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[10:11]
+; CHECK-NEXT: v_and_b32_e32 v35, 1, v12
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v35
+; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v14, v12, v14
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v9
; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
-; CHECK-NEXT: v_and_or_b32 v14, v9, s4, v12
+; CHECK-NEXT: v_lshrrev_b64 v[14:15], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v14, v12, v14
; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
-; CHECK-NEXT: v_add3_u32 v12, v12, v14, s5
+; CHECK-NEXT: v_add3_u32 v12, v12, v14, s4
; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v14
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v14, |v[10:11]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
; CHECK-NEXT: v_and_b32_e32 v15, 1, v14
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[10:11]|, v[8:9]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[10:11]|, v[8:9]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v8, v14, v8
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
-; CHECK-NEXT: v_and_or_b32 v9, v11, s4, v8
-; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
-; CHECK-NEXT: v_add3_u32 v8, v8, v9, s5
-; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v11
+; CHECK-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[8:9], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v8, v14, v8
+; CHECK-NEXT: v_bfe_u32 v9, v14, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v8, s4
+; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[4:5]|
-; CHECK-NEXT: v_perm_b32 v12, v8, v12, s6
+; CHECK-NEXT: v_perm_b32 v12, v8, v12, s5
; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[4:5]|, v[8:9]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[4:5]|, v[8:9]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
-; CHECK-NEXT: v_and_or_b32 v9, v5, s4, v8
-; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
-; CHECK-NEXT: v_add3_u32 v8, v8, v9, s5
-; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[8:9], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v8, v10, v8
+; CHECK-NEXT: v_bfe_u32 v9, v10, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v8, s4
+; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[6:7]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9
; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[6:7]|, v[4:5]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[6:7]|, v[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v4, v9, v4
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; CHECK-NEXT: v_and_or_b32 v5, v7, s4, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s5
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v4, v9, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v4, v9, v4
+; CHECK-NEXT: v_bfe_u32 v5, v9, 16, 1
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; CHECK-NEXT: v_perm_b32 v11, v4, v8, s6
+; CHECK-NEXT: v_perm_b32 v11, v4, v8, s5
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[0:1]|, v[4:5]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CHECK-NEXT: v_and_or_b32 v5, v1, s4, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s5
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v4, v6, v4
+; CHECK-NEXT: v_bfe_u32 v5, v6, 16, 1
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[2:3]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[2:3]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v3, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v5, v0
+; CHECK-NEXT: v_bfe_u32 v1, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v2, |v[28:29]|
-; CHECK-NEXT: v_perm_b32 v10, v0, v4, s6
+; CHECK-NEXT: v_perm_b32 v10, v0, v4, s5
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; CHECK-NEXT: v_and_b32_e32 v3, 1, v2
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[28:29]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[28:29]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[28:29]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[28:29]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v2, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v29, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v29
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v2, v0
+; CHECK-NEXT: v_bfe_u32 v1, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[28:29]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_f32_f64_e64 v3, |v[30:31]|
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; CHECK-NEXT: v_and_b32_e32 v4, 1, v3
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[30:31]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[30:31]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[30:31]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[30:31]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v3, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v31, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v31
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v3, v0
+; CHECK-NEXT: v_bfe_u32 v1, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[30:31]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_perm_b32 v3, v0, v2, s6
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT: v_perm_b32 v3, v0, v2, s5
; CHECK-NEXT: v_cvt_f32_f64_e64 v2, |v[24:25]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; CHECK-NEXT: v_and_b32_e32 v4, 1, v2
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[24:25]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[24:25]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[24:25]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[24:25]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v2, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v25, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v25
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v2, v0
+; CHECK-NEXT: v_bfe_u32 v1, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[24:25]
; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[26:27]|
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; CHECK-NEXT: v_and_b32_e32 v5, 1, v4
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[26:27]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[26:27]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[26:27]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[26:27]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v4, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v27, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v27
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v4, v0
+; CHECK-NEXT: v_bfe_u32 v1, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[26:27]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[20:21]|
-; CHECK-NEXT: v_perm_b32 v2, v0, v2, s6
+; CHECK-NEXT: v_perm_b32 v2, v0, v2, s5
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; CHECK-NEXT: v_and_b32_e32 v5, 1, v4
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[20:21]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[20:21]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[20:21]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[20:21]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v4, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v21, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v21
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v4, v0
+; CHECK-NEXT: v_bfe_u32 v1, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[20:21]
; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[22:23]|
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[22:23]|, v[0:1]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[22:23]|, v[0:1]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[22:23]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[22:23]|, v[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CHECK-NEXT: v_and_or_b32 v1, v23, s4, v0
-; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
-; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v23
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v0, v5, v0
+; CHECK-NEXT: v_bfe_u32 v1, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[22:23]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_perm_b32 v1, v0, v4, s6
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT: v_perm_b32 v1, v0, v4, s5
; CHECK-NEXT: v_cvt_f32_f64_e64 v0, |v[16:17]|
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
; CHECK-NEXT: v_and_b32_e32 v6, 1, v0
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[16:17]|, v[4:5]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[16:17]|, v[4:5]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[16:17]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[16:17]|, v[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v4, v0, v4
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v17
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: v_and_or_b32 v4, v17, s4, v0
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v4, v0, v4
; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
-; CHECK-NEXT: v_add3_u32 v0, v0, v4, s5
+; CHECK-NEXT: v_add3_u32 v0, v0, v4, s4
; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[16:17]
; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[18:19]|
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
-; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[18:19]|, v[4:5]
-; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[18:19]|, v[4:5]
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[18:19]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[18:19]|, v[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9]
; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
-; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CHECK-NEXT: v_and_or_b32 v5, v19, s4, v4
-; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
-; CHECK-NEXT: v_add3_u32 v4, v4, v5, s5
-; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_and_b32_e32 v35, 0x80000000, v19
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 32, v[34:35]
+; CHECK-NEXT: v_or_b32_e32 v4, v6, v4
+; CHECK-NEXT: v_bfe_u32 v5, v6, 16, 1
+; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[18:19]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_perm_b32 v0, v4, v0, s6
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; CHECK-NEXT: v_perm_b32 v0, v4, v0, s5
; CHECK-NEXT: global_store_dwordx4 v[32:33], v[0:3], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[32:33], v[10:13], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 3118d637880425..01bce150c95c2c 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -797,7 +797,7 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_bfe_u32 v1, v1, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
@@ -1013,7 +1013,7 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_bfe_u32 v3, v3, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 401cbce00ac9a8..c61a438bc4d595 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -253,15 +253,30 @@ define i48 @i48_func_void() #0 {
}
define zeroext i48 @i48_zeroext_func_void() #0 {
-; GFX789-LABEL: i48_zeroext_func_void:
-; GFX789: ; %bb.0:
-; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_mov_b32 s7, 0xf000
-; GFX789-NEXT: s_mov_b32 s6, -1
-; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX789-NEXT: buffer_load_ushort v1, off, s[4:7], 0
-; GFX789-NEXT: s_waitcnt vmcnt(0)
-; GFX789-NEXT: s_setpc_b64 s[30:31]
+; CI-LABEL: i48_zeroext_func_void:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: i48_zeroext_func_void:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; GFX89-NEXT: v_mov_b32_e32 v1, 0
+; GFX89-NEXT: s_waitcnt vmcnt(1)
+; GFX89-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX89-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i48_zeroext_func_void:
; GFX11: ; %bb.0:
@@ -269,24 +284,44 @@ define zeroext i48 @i48_zeroext_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i48, ptr addrspace(1) undef, align 8
ret i48 %val
}
define signext i48 @i48_signext_func_void() #0 {
-; GFX789-LABEL: i48_signext_func_void:
-; GFX789: ; %bb.0:
-; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_mov_b32 s7, 0xf000
-; GFX789-NEXT: s_mov_b32 s6, -1
-; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX789-NEXT: buffer_load_sshort v1, off, s[4:7], 0
-; GFX789-NEXT: s_waitcnt vmcnt(0)
-; GFX789-NEXT: s_setpc_b64 s[30:31]
+; CI-LABEL: i48_signext_func_void:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; CI-NEXT: buffer_load_sshort v1, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: i48_signext_func_void:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(1)
+; GFX89-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX89-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX89-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i48_signext_func_void:
; GFX11: ; %bb.0:
@@ -294,9 +329,16 @@ define signext i48 @i48_signext_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: buffer_load_i16 v1, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 32, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i48, ptr addrspace(1) undef, align 8
ret i48 %val
@@ -2371,10 +2413,11 @@ define <2 x bfloat> @v2bf16_func_void() #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v2bf16_func_void:
@@ -2407,8 +2450,9 @@ define <3 x bfloat> @v3bf16_func_void() #0 {
; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v3bf16_func_void:
@@ -2438,12 +2482,14 @@ define <4 x bfloat> @v4bf16_func_void() #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v4bf16_func_void:
@@ -2473,14 +2519,17 @@ define <6 x bfloat> @v6bf16_func_void() #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_dwordx3 v[3:5], off, s[4:7], 0
+; CI-NEXT: buffer_load_dwordx3 v[5:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v6bf16_func_void:
@@ -2510,16 +2559,20 @@ define <8 x bfloat> @v8bf16_func_void() #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT: buffer_load_dwordx4 v[7:10], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v8bf16_func_void:
@@ -2549,23 +2602,27 @@ define <16 x bfloat> @v16bf16_func_void() #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT: buffer_load_dwordx4 v[7:10], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; CI-NEXT: v_mov_b32_e32 v8, v0
-; CI-NEXT: v_mov_b32_e32 v9, v1
; CI-NEXT: v_mov_b32_e32 v10, v2
-; CI-NEXT: v_mov_b32_e32 v11, v3
; CI-NEXT: v_mov_b32_e32 v12, v4
-; CI-NEXT: v_mov_b32_e32 v13, v5
; CI-NEXT: v_mov_b32_e32 v14, v6
+; CI-NEXT: v_mov_b32_e32 v9, v1
+; CI-NEXT: v_mov_b32_e32 v11, v3
+; CI-NEXT: v_mov_b32_e32 v13, v5
; CI-NEXT: v_mov_b32_e32 v15, v7
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2602,37 +2659,41 @@ define <32 x bfloat> @v32bf16_func_void() #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT: buffer_load_dwordx4 v[7:10], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; CI-NEXT: v_mov_b32_e32 v8, v0
-; CI-NEXT: v_mov_b32_e32 v9, v1
; CI-NEXT: v_mov_b32_e32 v10, v2
-; CI-NEXT: v_mov_b32_e32 v11, v3
; CI-NEXT: v_mov_b32_e32 v12, v4
-; CI-NEXT: v_mov_b32_e32 v13, v5
; CI-NEXT: v_mov_b32_e32 v14, v6
; CI-NEXT: v_mov_b32_e32 v16, v0
-; CI-NEXT: v_mov_b32_e32 v17, v1
; CI-NEXT: v_mov_b32_e32 v18, v2
-; CI-NEXT: v_mov_b32_e32 v19, v3
; CI-NEXT: v_mov_b32_e32 v20, v4
-; CI-NEXT: v_mov_b32_e32 v21, v5
+; CI-NEXT: v_mov_b32_e32 v22, v6
; CI-NEXT: v_mov_b32_e32 v24, v0
-; CI-NEXT: v_mov_b32_e32 v25, v1
; CI-NEXT: v_mov_b32_e32 v26, v2
-; CI-NEXT: v_mov_b32_e32 v27, v3
; CI-NEXT: v_mov_b32_e32 v28, v4
-; CI-NEXT: v_mov_b32_e32 v29, v5
-; CI-NEXT: v_mov_b32_e32 v22, v6
; CI-NEXT: v_mov_b32_e32 v30, v6
+; CI-NEXT: v_mov_b32_e32 v9, v1
+; CI-NEXT: v_mov_b32_e32 v11, v3
+; CI-NEXT: v_mov_b32_e32 v13, v5
+; CI-NEXT: v_mov_b32_e32 v17, v1
+; CI-NEXT: v_mov_b32_e32 v19, v3
+; CI-NEXT: v_mov_b32_e32 v25, v1
+; CI-NEXT: v_mov_b32_e32 v27, v3
+; CI-NEXT: v_mov_b32_e32 v21, v5
+; CI-NEXT: v_mov_b32_e32 v29, v5
; CI-NEXT: v_mov_b32_e32 v15, v7
; CI-NEXT: v_mov_b32_e32 v23, v7
; CI-NEXT: v_mov_b32_e32 v31, v7
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index d10e049444d685..e6cac7aa7b05a4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -2169,39 +2169,41 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_dword v3, v[0:1], off
-; GFX900-NEXT: s_mov_b64 s[6:7], 0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT: s_movk_i32 s8, 0x7fff
-; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX900-NEXT: s_mov_b32 s9, 0x7060302
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: s_movk_i32 s6, 0x7fff
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: s_mov_b32 s7, 0x7060302
; GFX900-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v6, v3
; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GFX900-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX900-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v5, v5, v2
; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s6
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8
-; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX900-NEXT: v_add3_u32 v7, v9, v5, s6
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc
+; GFX900-NEXT: v_perm_b32 v5, v5, v3, s7
; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX900-NEXT: s_cbranch_execnz .LBB14_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX900-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX900-NEXT: v_mov_b32_e32 v0, v3
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2209,39 +2211,41 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_mov_b32 s7, 0x7060302
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v3
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX908-NEXT: v_add3_u32 v7, v9, v5, s6
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s7
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
@@ -2249,39 +2253,41 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_mov_b32 s7, 0x7060302
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s6
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc
+; GFX90A-NEXT: v_add3_u32 v6, v9, v5, s6
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s7
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2289,39 +2295,41 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v3, v5, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2329,42 +2337,44 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-NEXT: v_perm_b32 v5, v3, v5, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
@@ -2376,144 +2386,152 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_dword v3, v[0:1], off
-; GFX900-NEXT: s_mov_b64 s[6:7], 0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT: s_movk_i32 s8, 0x7fff
-; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX900-NEXT: s_mov_b32 s9, 0x7060302
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: s_movk_i32 s6, 0x7fff
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: s_mov_b32 s7, 0x7060302
; GFX900-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX900-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX900-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v6, v6, v5
; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX900-NEXT: v_add3_u32 v7, v9, v6, s6
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
+; GFX900-NEXT: v_perm_b32 v2, v6, v2, s7
; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX900-NEXT: s_cbranch_execnz .LBB15_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX900-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_atomic_fadd_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX908-NEXT: s_mov_b32 s7, 0x7060302
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_add3_u32 v7, v9, v6, s6
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s7
; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_atomic_fadd_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX90A-NEXT: s_mov_b32 s7, 0x7060302
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_add3_u32 v7, v9, v6, s6
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s7
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_atomic_fadd_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX10-NEXT: v_add3_u32 v8, v8, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
+; GFX10-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2521,40 +2539,42 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB15_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_atomic_fadd_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-NEXT: v_add3_u32 v8, v8, v2, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
+; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2562,12 +2582,12 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index d9cbbc11f9a738..d78cb26994150a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -2026,32 +2026,67 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr a
; Range is sufficiently restricted to push the shift into 32-bits.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
-; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
-; GCN: ; %bb.0:
-; GCN-NEXT: global_load_dword v0, v[0:1], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: global_load_dword v0, v0, s[2:3]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v0, v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_f32_saddr_zext_vgpr_range:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2061,32 +2096,67 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) i
; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
-; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
-; GCN: ; %bb.0:
-; GCN-NEXT: global_load_dword v0, v[0:1], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:400
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:400
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:400
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v0, v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:400
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index fdd913867c8f89..1794ff9b47c6eb 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -967,34 +967,38 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v5, off, s[0:3], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8
-; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8
+; GFX7-NEXT: v_bfe_i32 v1, v3, 16, 8
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 24, v3
+; GFX7-NEXT: v_bfe_i32 v6, v3, 0, 8
+; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
-; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2
-; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8
-; GFX7-NEXT: v_ashrrev_i32_e32 v8, 24, v0
-; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_ashrrev_i32_e32 v10, 24, v4
+; GFX7-NEXT: v_bfe_i32 v9, v4, 0, 8
+; GFX7-NEXT: v_bfe_u32 v11, v3, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; GFX7-NEXT: v_lshr_b64 v[0:1], v[0:1], 48
+; GFX7-NEXT: v_bfe_i32 v7, v4, 16, 8
+; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX7-NEXT: v_lshr_b64 v[1:2], v[2:3], 48
+; GFX7-NEXT: v_bfe_u32 v4, v4, 0, 16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v2, v6, v9, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT: v_mad_u32_u24 v2, v11, v4, v2
+; GFX7-NEXT: v_mad_u32_u24 v2, v8, v7, v2
+; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 9a1de74034cd83..1d9bc7a751c18a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2184,24 +2184,26 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8
-; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; GFX7-NEXT: v_alignbit_b32 v2, v4, v2, 16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 16
+; GFX7-NEXT: v_alignbit_b32 v0, v7, v0, 16
+; GFX7-NEXT: v_bfe_u32 v6, v6, 8, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v4, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v4, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 8c53d2671de3f6..f3a9a93fc9fbfe 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -32,31 +32,50 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v2
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
-; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v15, 15, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 20, v2
+; GFX7-NEXT: v_bfe_u32 v14, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xf0000000, v3
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xf0000000, v4
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
-; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
-; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v15, s4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 12, v2
+; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xf0000000, v5
+; GFX7-NEXT: v_ashrrev_i32_e32 v4, 28, v4
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v3, v14, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v2
+; GFX7-NEXT: v_bfe_u32 v11, v0, 16, 4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xf0000000, v6
+; GFX7-NEXT: v_ashrrev_i32_e32 v5, 28, v5
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v4, v13, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v2
+; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xf0000000, v7
+; GFX7-NEXT: v_ashrrev_i32_e32 v6, 28, v6
+; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
-; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v0, 24, 4
+; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000000, v8
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v7
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v6, v11, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v8
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v7, v10, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
+; GFX7-NEXT: v_mad_i32_i24 v1, v8, v9, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -69,13 +88,13 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_mov_b32 s10, -1
@@ -83,32 +102,51 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
-; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
-; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
-; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
-; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
-; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v3
+; GFX8-NEXT: v_bfe_u32 v7, v3, 4, 4
+; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4
+; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX8-NEXT: v_bfe_u32 v5, v3, 12, 4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
-; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v0
+; GFX8-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 20, v0
+; GFX8-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX8-NEXT: v_ashrrev_i32_e32 v10, 28, v10
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
-; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
-; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
-; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
-; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX8-NEXT: v_mad_i32_i24 v8, v9, v8, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 12, v0
+; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX8-NEXT: v_ashrrev_i32_e32 v11, 28, v11
+; GFX8-NEXT: v_mad_i32_i24 v7, v10, v7, v8
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 8, v0
+; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX8-NEXT: v_ashrrev_i32_e32 v12, 28, v12
+; GFX8-NEXT: v_mad_i32_i24 v6, v11, v6, v7
+; GFX8-NEXT: v_bfe_u32 v2, v3, 20, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 4, v0
+; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX8-NEXT: v_ashrrev_i32_e32 v13, 28, v13
+; GFX8-NEXT: v_mad_i32_i24 v5, v12, v5, v6
+; GFX8-NEXT: v_bfe_u32 v1, v3, 24, 4
+; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX8-NEXT: v_ashrrev_i32_e32 v14, 28, v14
+; GFX8-NEXT: v_mad_i32_i24 v4, v13, v4, v5
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 4
+; GFX8-NEXT: v_ashrrev_i32_e32 v15, 28, v15
+; GFX8-NEXT: v_mad_i32_i24 v2, v14, v2, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX8-NEXT: v_mad_i32_i24 v1, v15, v1, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -126,41 +164,60 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4
-; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4
-; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4
-; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4
-; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4
-; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4
-; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4
-; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4
-; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4
-; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4
-; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4
-; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4
-; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v2
+; GFX9-NEXT: v_bfe_u32 v8, v1, 4, 4
+; GFX9-NEXT: v_bfe_i32 v10, v2, 0, 4
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX9-NEXT: v_bfe_i32 v9, v9, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
+; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 12, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v2
+; GFX9-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX9-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX9-NEXT: v_mul_i32_i24_e32 v9, v10, v9
+; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v11
+; GFX9-NEXT: v_bfe_u32 v3, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v4, v1, 20, 4
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 4, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v4
-; GFX9-NEXT: v_mul_i32_i24_e32 v4, v5, v6
-; GFX9-NEXT: v_mul_i32_i24_e32 v5, v7, v8
-; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10
-; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX9-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX9-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX9-NEXT: v_mul_i32_i24_e32 v8, v10, v8
+; GFX9-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX9-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v16, 0xf0000000, v16
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v7
+; GFX9-NEXT: v_mul_i32_i24_e32 v6, v12, v6
+; GFX9-NEXT: v_mul_i32_i24_e32 v1, v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4
-; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12
-; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14
-; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6
-; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16
-; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8
-; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1
+; GFX9-NEXT: v_add3_u32 v2, v9, s0, v8
+; GFX9-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 28, v16
+; GFX9-NEXT: v_mul_i32_i24_e32 v5, v13, v5
+; GFX9-NEXT: v_mul_i32_i24_e32 v4, v14, v4
+; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6
+; GFX9-NEXT: v_mul_i32_i24_e32 v3, v15, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4
+; GFX9-NEXT: v_add3_u32 v1, v2, v3, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
@@ -176,12 +233,60 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v1
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v11, 24, v2
+; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 4, 4
+; GFX9-DL-NEXT: v_bfe_i32 v10, v2, 0, 4
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v12, 20, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX9-DL-NEXT: v_bfe_i32 v9, v9, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
+; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v14, 12, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v15, 8, v2
+; GFX9-DL-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v9, v10, v9
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v10, 28, v11
+; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 20, 4
+; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v16, 4, v2
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-DL-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX9-DL-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v8, v10, v8
+; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xf0000000, v16
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v11, v7
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v12, v6
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v2, v1
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_add3_u32 v2, v9, s0, v8
+; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v15, 28, v16
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v13, v5
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v14, v4
+; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, v15, v3
+; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4
+; GFX9-DL-NEXT: v_add3_u32 v1, v2, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
@@ -198,13 +303,61 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
-; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, 15, v1
+; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v10, 24, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 0, 4
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v11, 20, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 12, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 8, 4
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v13, 12, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v14, 8, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v9, v8
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v9, 28, v10
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v3, v1, 20, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v4, v1, 16, 4
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v15, 4, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v10, 28, v11
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v7, v9, v7
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v0, v1, 24, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v10, v6
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xf0000000, v15
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v5, v11, v5
+; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v7, v8, s2, v7
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v12, v4
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v13, v3
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v8, 28, v9
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v0, 0, 4
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v5, v7, v6, v5
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v0, v8, v0
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v2, v1
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v2, v5, v4, v3
+; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v2, v0, v1
+; GFX10-DL-XNACK-NEXT: global_store_dword v3, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
@@ -213,7 +366,6 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
@@ -221,12 +373,61 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, 15, v1
+; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v11, 20, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 12, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v6, v1, 8, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v13, 12, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v14, 8, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v9, v8
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v9, 28, v10
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v3, v1, 20, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v4, v1, 16, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v15, 4, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v10, 28, v11
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v7, v9, v7
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v2, v1, 24, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v10, v6
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xf0000000, v15
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v5, v11, v5
+; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v7, v8, s2, v7
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v12, v4
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v13, v3
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v8, 28, v9
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v5, v7, v6, v5
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v2, v8, v2
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v0, v1
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v5, v4, v3
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v2, v0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v3, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1369,31 +1570,50 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX7-NEXT: v_and_b32_e32 v15, 15, v0
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 20, v2
+; GFX7-NEXT: v_bfe_u32 v14, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xf0000000, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
-; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
-; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
-; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT: v_mad_i32_i24 v16, v1, v15, s4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xf0000000, v4
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v15, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 12, v2
+; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xf0000000, v5
+; GFX7-NEXT: v_ashrrev_i32_e32 v4, 28, v4
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v3, v14, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v2
+; GFX7-NEXT: v_bfe_u32 v11, v0, 16, 4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xf0000000, v6
+; GFX7-NEXT: v_ashrrev_i32_e32 v5, 28, v5
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v4, v13, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v2
+; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xf0000000, v7
+; GFX7-NEXT: v_ashrrev_i32_e32 v6, 28, v6
+; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
-; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v0, 24, 4
+; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000000, v8
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v7
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v6, v11, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v8
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v7, v10, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
+; GFX7-NEXT: v_mad_i32_i24 v1, v8, v9, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1407,13 +1627,13 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_mov_b32 s10, -1
@@ -1421,33 +1641,52 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
-; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
-; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
-; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
-; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
-; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v3
+; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX8-NEXT: v_bfe_u32 v7, v3, 4, 4
+; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4
+; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX8-NEXT: v_bfe_u32 v5, v3, 12, 4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
+; GFX8-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 20, v0
+; GFX8-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2
-; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
-; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
-; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
-; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
-; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX8-NEXT: v_mad_i32_i24 v16, v9, v8, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX8-NEXT: v_ashrrev_i32_e32 v10, 28, v10
+; GFX8-NEXT: v_mad_i32_i24 v8, v9, v8, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 12, v0
+; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX8-NEXT: v_ashrrev_i32_e32 v11, 28, v11
+; GFX8-NEXT: v_mad_i32_i24 v7, v10, v7, v8
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 8, v0
+; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX8-NEXT: v_ashrrev_i32_e32 v12, 28, v12
+; GFX8-NEXT: v_mad_i32_i24 v6, v11, v6, v7
+; GFX8-NEXT: v_bfe_u32 v2, v3, 20, 4
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 4, v0
+; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX8-NEXT: v_ashrrev_i32_e32 v13, 28, v13
+; GFX8-NEXT: v_mad_i32_i24 v5, v12, v5, v6
+; GFX8-NEXT: v_bfe_u32 v1, v3, 24, 4
+; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX8-NEXT: v_ashrrev_i32_e32 v14, 28, v14
+; GFX8-NEXT: v_mad_i32_i24 v4, v13, v4, v5
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 4
+; GFX8-NEXT: v_ashrrev_i32_e32 v15, 28, v15
+; GFX8-NEXT: v_mad_i32_i24 v2, v14, v2, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX8-NEXT: v_mad_i32_i24 v1, v15, v1, v2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1466,42 +1705,61 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4
-; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4
-; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4
-; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4
-; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4
-; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4
-; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4
-; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4
-; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4
-; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4
-; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4
-; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4
-; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 12, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 4, v2
+; GFX9-NEXT: v_bfe_u32 v3, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v4, v1, 20, 4
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4
+; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
+; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX9-NEXT: v_bfe_u32 v8, v1, 4, 4
+; GFX9-NEXT: v_bfe_i32 v10, v2, 0, 4
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-NEXT: v_bfe_i32 v9, v9, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX9-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX9-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX9-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX9-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX9-NEXT: v_and_b32_e32 v16, 0xf0000000, v16
+; GFX9-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX9-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX9-NEXT: v_mul_i32_i24_e32 v1, v2, v1
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v11
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 28, v16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0
-; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6
-; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8
-; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2
-; GFX9-NEXT: v_mul_i32_i24_e32 v7, v9, v10
-; GFX9-NEXT: v_mul_i32_i24_e32 v8, v11, v12
-; GFX9-NEXT: v_add3_u32 v3, v3, v5, v6
-; GFX9-NEXT: v_mul_i32_i24_e32 v9, v13, v14
-; GFX9-NEXT: v_mul_i32_i24_e32 v10, v15, v16
-; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8
-; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10
-; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2
+; GFX9-NEXT: v_mad_i32_i24 v16, v10, v9, s0
+; GFX9-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX9-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX9-NEXT: v_mul_i32_i24_e32 v2, v2, v8
+; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v7
+; GFX9-NEXT: v_mad_i32_i24 v8, v10, v9, v16
+; GFX9-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX9-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX9-NEXT: v_mul_i32_i24_e32 v6, v12, v6
+; GFX9-NEXT: v_mul_i32_i24_e32 v5, v13, v5
+; GFX9-NEXT: v_add3_u32 v2, v8, v2, v7
+; GFX9-NEXT: v_mul_i32_i24_e32 v4, v14, v4
+; GFX9-NEXT: v_mul_i32_i24_e32 v3, v15, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5
+; GFX9-NEXT: v_add3_u32 v2, v2, v4, v3
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v16
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
@@ -1517,42 +1775,61 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 4
-; GFX9-DL-NEXT: v_bfe_i32 v5, v1, 4, 4
-; GFX9-DL-NEXT: v_bfe_i32 v6, v2, 4, 4
-; GFX9-DL-NEXT: v_bfe_i32 v7, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_i32 v8, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_i32 v9, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_i32 v10, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_i32 v11, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_i32 v12, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_i32 v13, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_i32 v14, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_i32 v15, v1, 24, 4
-; GFX9-DL-NEXT: v_bfe_i32 v16, v2, 24, 4
-; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v11, 24, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v12, 20, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v14, 12, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v15, 8, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v16, 4, v2
+; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 20, 4
+; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
+; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
+; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 4, 4
+; GFX9-DL-NEXT: v_bfe_i32 v10, v2, 0, 4
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX9-DL-NEXT: v_bfe_i32 v9, v9, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xf0000000, v16
+; GFX9-DL-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX9-DL-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v2, v1
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v11
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v15, 28, v16
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8
-; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v9, v10
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v8, v11, v12
-; GFX9-DL-NEXT: v_add3_u32 v3, v3, v5, v6
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v9, v13, v14
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v10, v15, v16
-; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8
-; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10
-; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v16, v10, v9, s0
+; GFX9-DL-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v2, v8
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v11, v7
+; GFX9-DL-NEXT: v_mad_i32_i24 v8, v10, v9, v16
+; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v12, v6
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v13, v5
+; GFX9-DL-NEXT: v_add3_u32 v2, v8, v2, v7
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v14, v4
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, v15, v3
+; GFX9-DL-NEXT: v_add3_u32 v2, v2, v6, v5
+; GFX9-DL-NEXT: v_add3_u32 v2, v2, v4, v3
+; GFX9-DL-NEXT: v_add3_u32 v1, v2, v1, v16
; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1569,42 +1846,61 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
-; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, 15, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 4, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v1, 8, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v2, 8, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v2, 0, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v1, 12, 4
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v10, 24, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v11, 20, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 8, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 0, 4
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v13, 12, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v4, v1, 16, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 12, 4
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v14, 8, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v15, 4, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v10, 28, v10
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v11
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4
-; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v16, v9, v8, s2
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v0, v1, 24, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v3, v1, 20, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v12, 28, v12
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v13, 28, v13
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v7, v10, v7
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v11, v6
+; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v8, v9, v8, v16
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v14, 28, v14
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v9, 28, v15
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v0, 0, 4
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v5, v12, v5
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v13, v4
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v6, v8, v7, v6
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v14, v3
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v0, v9, v0
; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v4, v6, v5, v4
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v2, v1
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v4, v3, v0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v16
; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@@ -1621,42 +1917,61 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, 15, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v0, 4, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v1, 8, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v0, 8, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v0, 0, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v1, 12, 4
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v11, 20, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v6, v1, 8, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v13, 12, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v8, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v4, v1, 16, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 12, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v14, 8, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v15, 4, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v7, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v6, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v10, 28, v10
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v11
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4
-; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v16, v9, v8, s2
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v2, v1, 24, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v3, v1, 20, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v5, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v4, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v12, 28, v12
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v13, 28, v13
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v7, v10, v7
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v11, v6
+; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v8, v9, v8, v16
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v3, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v14, 28, v14
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v9, 28, v15
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v5, v12, v5
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v13, v4
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v6, v8, v7, v6
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v14, v3
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v2, v9, v2
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v4, v6, v5, v4
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v0, v1
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v4, v3, v2
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v16
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1759,27 +2074,39 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v0
; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4
; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0
-; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 4, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 12, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 20, v0
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
+; GFX7-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v15
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
+; GFX7-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX7-NEXT: v_ashrrev_i32_e32 v14, 28, v14
; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
+; GFX7-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX7-NEXT: v_ashrrev_i32_e32 v13, 28, v13
; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
+; GFX7-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX7-NEXT: v_ashrrev_i32_e32 v12, 28, v12
; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
+; GFX7-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v11
; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0
+; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v10
; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0
@@ -1814,25 +2141,37 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 4
; GFX8-NEXT: v_bfe_i32 v6, v3, 12, 4
; GFX8-NEXT: v_bfe_i32 v7, v3, 8, 4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 4, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 12, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 20, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 24, v0
; GFX8-NEXT: v_bfe_i32 v8, v3, 4, 4
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 4
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v9, 28, v0
-; GFX8-NEXT: v_bfe_i32 v10, v0, 24, 4
-; GFX8-NEXT: v_bfe_i32 v11, v0, 20, 4
-; GFX8-NEXT: v_bfe_i32 v12, v0, 16, 4
-; GFX8-NEXT: v_bfe_i32 v13, v0, 12, 4
-; GFX8-NEXT: v_bfe_i32 v14, v0, 8, 4
-; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4
+; GFX8-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX8-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX8-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX8-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX8-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX8-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
-; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0
-; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0
-; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0
-; GFX8-NEXT: v_mad_i32_i24 v0, v5, v12, v0
-; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0
-; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v10
+; GFX8-NEXT: v_ashrrev_i32_e32 v10, 28, v11
+; GFX8-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX8-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX8-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX8-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX8-NEXT: v_mad_i32_i24 v0, v8, v14, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v7, v13, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v6, v12, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v5, v11, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v4, v10, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v2, v3, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1857,6 +2196,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 4, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 12, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 24, v2
; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4
; GFX9-NEXT: v_bfe_i32 v5, v1, 20, 4
; GFX9-NEXT: v_bfe_i32 v6, v1, 16, 4
@@ -1864,28 +2210,33 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: v_bfe_i32 v8, v1, 8, 4
; GFX9-NEXT: v_bfe_i32 v9, v1, 4, 4
; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v2
-; GFX9-NEXT: v_bfe_i32 v11, v2, 24, 4
-; GFX9-NEXT: v_bfe_i32 v12, v2, 20, 4
-; GFX9-NEXT: v_bfe_i32 v13, v2, 16, 4
-; GFX9-NEXT: v_bfe_i32 v14, v2, 12, 4
-; GFX9-NEXT: v_bfe_i32 v15, v2, 8, 4
-; GFX9-NEXT: v_bfe_i32 v16, v2, 4, 4
; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX9-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX9-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX9-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX9-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX9-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX9-NEXT: v_and_b32_e32 v16, 0xf0000000, v16
; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
-; GFX9-NEXT: v_mul_i32_i24_e32 v2, v9, v16
-; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15
-; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v11
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 28, v16
+; GFX9-NEXT: v_mul_i32_i24_e32 v9, v9, v15
+; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v14
+; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v13
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
-; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13
-; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12
+; GFX9-NEXT: v_add3_u32 v1, v1, s0, v9
+; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v12
+; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v11
; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
-; GFX9-NEXT: v_mul_i32_i24_e32 v4, v4, v11
+; GFX9-NEXT: v_mul_i32_i24_e32 v2, v4, v2
; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10
; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
-; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
@@ -1905,8 +2256,49 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v3, 28, v1
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v11, 4, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v12, 8, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v13, 12, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v15, 20, v2
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v16, 24, v2
+; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 24, 4
+; GFX9-DL-NEXT: v_bfe_i32 v5, v1, 20, 4
+; GFX9-DL-NEXT: v_bfe_i32 v6, v1, 16, 4
+; GFX9-DL-NEXT: v_bfe_i32 v7, v1, 12, 4
+; GFX9-DL-NEXT: v_bfe_i32 v8, v1, 8, 4
+; GFX9-DL-NEXT: v_bfe_i32 v9, v1, 4, 4
+; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 4
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v10, 28, v2
+; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 0xf0000000, v15
+; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xf0000000, v16
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v11
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v11, 28, v12
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v12, 28, v13
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v13, 28, v14
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v14, 28, v15
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v15, 28, v16
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v9, v9, v15
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v8, v8, v14
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v7, v13
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v9
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v6, v12
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v11
+; GFX9-DL-NEXT: v_add3_u32 v1, v1, v8, v7
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v2
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, v3, v10
+; GFX9-DL-NEXT: v_add3_u32 v1, v1, v6, v5
+; GFX9-DL-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1925,11 +2317,52 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v1
+; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v11, 24, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v13, 20, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 24, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v1, 20, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v1, 16, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 12, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 8, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v1, 4, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v1, v1, 0, 4
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v10, 8, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v14, 12, v2
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v15, v2, 0, 4
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v11
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 4, v2
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v12, 28, v12
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v13, 28, v13
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v15
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v11
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xf0000000, v9
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v10, 28, v10
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v14
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v7, v7, v13
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v12
+; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v1, v1, s2, v8
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v8, 28, v9
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v5, v5, v11
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v4, v10
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v1, v1, v7, v6
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v8
+; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v1, v1, v5, v4
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v1, v3, v0
+; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
@@ -1938,7 +2371,6 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
@@ -1949,8 +2381,50 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
+; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v1
+; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v11, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v13, 20, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 24, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v1, 20, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 0xf0000000, v11
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v1, 16, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 12, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 8, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v1, 4, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v1, v1, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 8, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v14, 12, v0
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xf0000000, v12
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 0xf0000000, v13
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v15, v0, 0, 4
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v11
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v9, 4, v0
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, 0xf0000000, v10
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v14, 0xf0000000, v14
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v12, 28, v12
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v13, 28, v13
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v15
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v11
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xf0000000, v9
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v10, 28, v10
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v11, 28, v14
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v7, v7, v13
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v12
+; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v1, s2, v8
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v8, 28, v9
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v5, v5, v11
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v4, v10
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v1, v7, v6
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v8
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v1, v5, v4
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -2007,55 +2481,69 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT: v_bfe_i32 v1, v3, 24, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v3
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v11
+; GFX7-NEXT: v_bfe_i32 v7, v3, 8, 4
+; GFX7-NEXT: v_bfe_i32 v10, v3, 0, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v3
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
-; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v2
-; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT: v_ashrrev_i32_e32 v14, 28, v0
-; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_bfe_u32 v15, v5, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xf0000000, v14
+; GFX7-NEXT: v_bfe_i32 v19, v5, 0, 4
+; GFX7-NEXT: v_lshr_b64 v[0:1], v[0:1], 48
+; GFX7-NEXT: v_bfe_i32 v13, v3, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xf0000000, v12
+; GFX7-NEXT: v_bfe_i32 v21, v5, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v11
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v19
+; GFX7-NEXT: v_bfe_i32 v9, v3, 16, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; GFX7-NEXT: v_bfe_i32 v13, v5, 8, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v22, 28, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v21
+; GFX7-NEXT: v_bfe_u32 v11, v11, 0, 16
+; GFX7-NEXT: v_bfe_u32 v15, v15, 0, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v10, v1, v8
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 48
+; GFX7-NEXT: v_lshr_b64 v[6:7], v[6:7], 48
+; GFX7-NEXT: v_mad_u32_u24 v1, v11, v15, v1
+; GFX7-NEXT: v_bfe_u32 v16, v5, 20, 4
+; GFX7-NEXT: v_bfe_i32 v14, v5, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v18, v13, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v20, 28, v5
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 4
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v2, v6, v1
+; GFX7-NEXT: v_bfe_i32 v12, v5, 24, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX7-NEXT: v_bfe_u32 v19, v22, 0, 16
+; GFX7-NEXT: v_bfe_u32 v7, v16, 0, 16
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX7-NEXT: v_lshr_b64 v[3:4], v[4:5], 48
+; GFX7-NEXT: v_mad_u32_u24 v1, v19, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v17, v12, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v0, v3, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2557,48 +3045,63 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX7-NEXT: v_and_b32_e32 v9, 0xf0000000, v9
; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 4, 4
; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_ashrrev_i32_e32 v4, 28, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v14
+; GFX7-NEXT: v_bfe_i32 v5, v2, 24, 4
; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4
+; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2
-; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4
; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4
+; GFX7-NEXT: v_or_b32_e32 v9, v11, v9
+; GFX7-NEXT: v_or_b32_e32 v11, v13, v12
+; GFX7-NEXT: v_bfe_u32 v10, v0, 12, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4
-; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0
-; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_bfe_u32 v13, v11, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
+; GFX7-NEXT: v_bfe_i32 v16, v0, 8, 4
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 4
+; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v11, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT: v_mad_u32_u24 v1, v2, v13, v1
+; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8
+; GFX7-NEXT: v_bfe_u32 v10, v10, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v15, v1
+; GFX7-NEXT: v_bfe_u32 v16, v3, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT: v_bfe_u32 v12, v9, 8, 8
; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v10, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, v9, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
+; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v16, v12, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v14, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v8, v0, v1
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 3828fa557731e8..4784fee4853789 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2114,39 +2114,55 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v12, 15, v3
+; GFX7-NEXT: v_bfe_u32 v7, v3, 20, 4
+; GFX7-NEXT: v_bfe_u32 v13, v3, 4, 4
+; GFX7-NEXT: v_bfe_u32 v14, v3, 12, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v18, 15, v5
+; GFX7-NEXT: v_bfe_u32 v20, v5, 4, 4
+; GFX7-NEXT: v_bfe_u32 v21, v5, 12, 4
+; GFX7-NEXT: v_lshr_b64 v[0:1], v[0:1], 48
+; GFX7-NEXT: v_bfe_u32 v9, v3, 24, 4
+; GFX7-NEXT: v_bfe_u32 v10, v3, 16, 4
+; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v14
+; GFX7-NEXT: v_bfe_u32 v13, v13, 0, 16
+; GFX7-NEXT: v_bfe_u32 v14, v7, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v21
+; GFX7-NEXT: v_bfe_u32 v1, v20, 0, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v8, v12, v18, v8
+; GFX7-NEXT: v_bfe_u32 v17, v5, 8, 4
+; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 48
+; GFX7-NEXT: v_lshr_b64 v[6:7], v[6:7], 48
+; GFX7-NEXT: v_mad_u32_u24 v1, v13, v1, v8
+; GFX7-NEXT: v_mad_u32_u24 v1, v11, v17, v1
+; GFX7-NEXT: v_bfe_u32 v15, v5, 24, 4
+; GFX7-NEXT: v_bfe_u32 v16, v5, 16, 4
+; GFX7-NEXT: v_bfe_u32 v19, v5, 20, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 28, v5
+; GFX7-NEXT: v_mad_u32_u24 v1, v2, v6, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_u32 v19, v19, 0, 16
+; GFX7-NEXT: v_mad_u32_u24 v1, v10, v16, v1
+; GFX7-NEXT: v_lshr_b64 v[3:4], v[4:5], 48
+; GFX7-NEXT: v_mad_u32_u24 v1, v14, v19, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v0, v3, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2444,14 +2460,8 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v7, 15, v2
+; GFX7-NEXT: v_bfe_u32 v6, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
@@ -2461,15 +2471,25 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_bfe_u32 v8, v2, 12, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v0, v1
+; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT: v_mad_u32_u24 v0, v6, v16, v0
+; GFX7-NEXT: v_bfe_u32 v8, v8, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v15, v0
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v0, v8, v14, v0
+; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v13, v0
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v12, v0
+; GFX7-NEXT: v_bfe_u32 v9, v9, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v11, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v9, v10, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index f736ca7cd625a3..b973f0709a70ea 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -445,18 +445,24 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s4, s[0:1], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s4, s4, 3
-; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT: s_and_b32 s7, s5, 0x1010101
-; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s4
+; GCN-NEXT: s_and_b32 s9, s7, 0x1010101
+; GCN-NEXT: s_and_b32 s8, s6, 0x1010101
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GCN-NEXT: s_mov_b32 s4, s2
+; GCN-NEXT: s_mov_b32 s2, s3
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 32
+; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
@@ -971,19 +977,18 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
; GCN-NEXT: s_addc_u32 s5, s5, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e64 v1, s2, 2
; GCN-NEXT: s_and_b32 s3, s3, 3
-; GCN-NEXT: v_mov_b32_e32 v1, s2
-; GCN-NEXT: v_lshrrev_b16_e64 v2, 1, s2
-; GCN-NEXT: v_lshrrev_b16_e64 v3, 2, s2
-; GCN-NEXT: v_lshrrev_b16_e64 v4, 3, s2
+; GCN-NEXT: v_and_b32_e64 v2, s2, 12
+; GCN-NEXT: v_lshrrev_b16_e32 v1, 1, v1
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_or_b32_e32 v0, s3, v0
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_and_b32_e32 v3, 3, v3
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0
-; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3
+; GCN-NEXT: v_lshrrev_b16_e32 v3, 2, v2
+; GCN-NEXT: v_lshrrev_b16_e32 v2, 3, v2
+; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0
+; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:3
; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2
-; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
+; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen
; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 68427e8937bb94..5b18b0d7037528 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1516,6 +1516,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
; SI-NEXT: s_andn2_b32 s1, s2, s0
; SI-NEXT: s_and_b32 s0, s0, 0x50005
; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_and_b32 s1, s0, 0xffff
+; SI-NEXT: s_lshr_b32 s0, s0, 16
+; SI-NEXT: s_lshl_b32 s0, s0, 16
+; SI-NEXT: s_or_b32 s0, s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -1603,6 +1607,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
; SI-NEXT: s_andn2_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s5, 0x505
; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_and_b32 s5, s4, 0xff
+; SI-NEXT: s_lshr_b32 s4, s4, 8
+; SI-NEXT: s_lshl_b32 s4, s4, 8
+; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1620,8 +1628,11 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
; VI-NEXT: v_not_b32_e32 v1, v0
; VI-NEXT: v_and_b32_e32 v1, s4, v1
; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
@@ -1690,6 +1701,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8
; SI-NEXT: s_andn2_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s5, 0x5050505
; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_and_b32 s5, s4, 0xffff
+; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: s_lshl_b32 s4, s4, 16
+; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1707,6 +1722,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8
; VI-NEXT: s_andn2_b32 s4, s4, s5
; VI-NEXT: s_and_b32 s5, s5, 0x5050505
; VI-NEXT: s_or_b32 s4, s5, s4
+; VI-NEXT: s_and_b32 s5, s4, 0xffff
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshl_b32 s4, s4, 16
+; VI-NEXT: s_or_b32 s4, s5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -1733,6 +1752,12 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; SI-NEXT: s_and_b32 s8, s0, 0x5050505
; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0
+; SI-NEXT: s_mov_b32 s2, s0
+; SI-NEXT: s_mov_b32 s0, s1
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1755,6 +1780,12 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; VI-NEXT: s_and_b32 s8, s0, 0x5050505
; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s2, s0
+; VI-NEXT: s_mov_b32 s0, s1
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1777,13 +1808,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1
; SI-NEXT: s_lshr_b32 s4, s11, 24
; SI-NEXT: s_cmp_lg_u32 s6, 15
; SI-NEXT: s_cselect_b32 s4, s4, 5
-; SI-NEXT: s_lshl_b32 s4, s4, 24
+; SI-NEXT: s_lshl_b32 s4, s4, 8
; SI-NEXT: s_lshr_b32 s5, s11, 16
; SI-NEXT: s_cmp_lg_u32 s6, 14
; SI-NEXT: s_cselect_b32 s5, s5, 5
; SI-NEXT: s_and_b32 s5, s5, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshr_b32 s5, s11, 8
; SI-NEXT: s_cmp_lg_u32 s6, 13
; SI-NEXT: s_cselect_b32 s5, s5, 5
@@ -1797,13 +1828,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1
; SI-NEXT: s_lshr_b32 s5, s10, 24
; SI-NEXT: s_cmp_lg_u32 s6, 11
; SI-NEXT: s_cselect_b32 s5, s5, 5
-; SI-NEXT: s_lshl_b32 s5, s5, 24
+; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_lshr_b32 s7, s10, 16
; SI-NEXT: s_cmp_lg_u32 s6, 10
; SI-NEXT: s_cselect_b32 s7, s7, 5
; SI-NEXT: s_and_b32 s7, s7, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 16
-; SI-NEXT: s_or_b32 s5, s5, s7
+; SI-NEXT: s_or_b32 s5, s7, s5
+; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_lshr_b32 s7, s10, 8
; SI-NEXT: s_cmp_lg_u32 s6, 9
; SI-NEXT: s_cselect_b32 s7, s7, 5
@@ -1817,13 +1848,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1
; SI-NEXT: s_lshr_b32 s7, s9, 24
; SI-NEXT: s_cmp_lg_u32 s6, 7
; SI-NEXT: s_cselect_b32 s7, s7, 5
-; SI-NEXT: s_lshl_b32 s7, s7, 24
+; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_lshr_b32 s10, s9, 16
; SI-NEXT: s_cmp_lg_u32 s6, 6
; SI-NEXT: s_cselect_b32 s10, s10, 5
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: s_lshl_b32 s10, s10, 16
-; SI-NEXT: s_or_b32 s7, s7, s10
+; SI-NEXT: s_or_b32 s7, s10, s7
+; SI-NEXT: s_lshl_b32 s7, s7, 16
; SI-NEXT: s_lshr_b32 s10, s9, 8
; SI-NEXT: s_cmp_lg_u32 s6, 5
; SI-NEXT: s_cselect_b32 s10, s10, 5
@@ -1837,13 +1868,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1
; SI-NEXT: s_lshr_b32 s9, s8, 24
; SI-NEXT: s_cmp_lg_u32 s6, 3
; SI-NEXT: s_cselect_b32 s9, s9, 5
-; SI-NEXT: s_lshl_b32 s9, s9, 24
+; SI-NEXT: s_lshl_b32 s9, s9, 8
; SI-NEXT: s_lshr_b32 s10, s8, 16
; SI-NEXT: s_cmp_lg_u32 s6, 2
; SI-NEXT: s_cselect_b32 s10, s10, 5
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: s_lshl_b32 s10, s10, 16
-; SI-NEXT: s_or_b32 s9, s9, s10
+; SI-NEXT: s_or_b32 s9, s10, s9
+; SI-NEXT: s_lshl_b32 s9, s9, 16
; SI-NEXT: s_lshr_b32 s10, s8, 8
; SI-NEXT: s_cmp_lg_u32 s6, 1
; SI-NEXT: s_cselect_b32 s10, s10, 5
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 1ba2491d2210ec..0acbe9e3d5aa12 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -25,7 +25,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CIVI-NEXT: s_lshr_b32 s0, s2, 16
+; CIVI-NEXT: s_lshl_b32 s0, s0, 16
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
@@ -71,11 +72,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -91,7 +93,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CI-NEXT: s_lshr_b32 s0, s2, 16
+; CI-NEXT: s_lshl_b32 s0, s0, 16
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -142,12 +145,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; VI-NEXT: s_lshl_b32 s2, s1, 16
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -162,17 +165,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
-; CI-NEXT: s_or_b32 s0, s0, s1
+; CI-NEXT: s_lshr_b32 s1, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s1, 16
+; CI-NEXT: s_or_b32 s0, s0, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s2
+; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
@@ -594,7 +597,8 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CIVI-NEXT: s_lshr_b32 s0, s2, 16
+; CIVI-NEXT: s_lshl_b32 s0, s0, 16
; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
@@ -689,11 +693,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -707,11 +712,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -838,11 +844,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v2, 53, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -856,11 +863,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, 53, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -1055,11 +1063,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -1073,11 +1082,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -1127,11 +1137,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v2, 53, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -1145,11 +1156,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, 53, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -1374,6 +1386,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_andn2_b32 s1, s2, s0
; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7
; CI-NEXT: s_or_b32 s0, s0, s1
+; CI-NEXT: s_and_b32 s1, s0, 0xffff
+; CI-NEXT: s_lshr_b32 s0, s0, 16
+; CI-NEXT: s_lshl_b32 s0, s0, 16
+; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -1448,19 +1464,23 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: v_mov_b32_e32 v4, 0x3e703e7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: s_lshl_b32 s0, s4, 4
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
-; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
+; CI-NEXT: s_lshl_b32 s1, s4, 4
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: s_lshl_b32 s0, 0xffff, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v2, s0, v2, v3
+; CI-NEXT: v_bfi_b32 v2, s0, v4, v3
+; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -1551,14 +1571,18 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: s_mov_b32 s2, 0x12341234
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_mov_b32 s0, 0x12341234
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v2, v2, s0, v3
+; CI-NEXT: v_bfi_b32 v2, v2, s2, v3
+; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -1632,20 +1656,24 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v3
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: v_mov_b32_e32 v4, s4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v3
+; CI-NEXT: s_and_b32 s0, s4, 0xffff
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; CI-NEXT: v_lshl_b64 v[0:1], v[1:2], 32
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; CI-NEXT: v_or_b32_e32 v2, s0, v2
+; CI-NEXT: v_or_b32_e32 v0, v2, v0
+; CI-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_0:
@@ -1710,20 +1738,23 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v3
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v3
; CI-NEXT: s_lshl_b32 s0, s4, 16
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: v_or_b32_e32 v0, s0, v0
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; CI-NEXT: v_lshl_b64 v[0:1], v[1:2], 32
+; CI-NEXT: v_or_b32_e32 v2, s0, v5
+; CI-NEXT: v_or_b32_e32 v0, v2, v0
+; CI-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_1:
@@ -1789,20 +1820,24 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v3
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: v_mov_b32_e32 v4, s4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v3
+; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: v_lshl_b64 v[1:2], v[1:2], 32
+; CI-NEXT: v_or_b32_e32 v1, v0, v1
+; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_2:
@@ -1867,20 +1902,23 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v3
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_lshl_b32 s0, s4, 16
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: s_lshl_b32 s1, s4, 16
+; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v3
+; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT: v_or_b32_e32 v1, s0, v1
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: v_lshl_b64 v[1:2], v[1:2], 32
+; CI-NEXT: v_or_b32_e32 v1, v0, v1
+; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_3:
@@ -1946,20 +1984,24 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v3
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: v_mov_b32_e32 v4, s4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v3
+; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: v_lshl_b64 v[1:2], v[1:2], 32
+; CI-NEXT: v_or_b32_e32 v1, v0, v1
+; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4i16_2:
@@ -2042,6 +2084,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
+; CI-NEXT: v_mov_b32_e32 v6, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -2057,9 +2100,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
+; CI-NEXT: v_bfi_b32 v5, v5, s0, v1
+; CI-NEXT: v_lshl_b64 v[5:6], v[5:6], 32
; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_or_b32_e32 v5, v0, v5
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[5:6]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
@@ -2146,6 +2191,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -2159,12 +2205,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
-; CI-NEXT: v_mov_b32_e32 v5, s2
+; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
-; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_bfi_b32 v4, s1, v4, v1
+; CI-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; CI-NEXT: v_bfi_b32 v0, s0, v6, v0
+; CI-NEXT: v_or_b32_e32 v4, v0, v4
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr:
@@ -2220,12 +2268,18 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: s_lshl_b32 s1, s4, 16
+; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -2235,19 +2289,24 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
-; CI-NEXT: s_lshl_b32 s0, s4, 16
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v6, s1
+; CI-NEXT: s_lshl_b32 s1, s4, 16
+; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v4
+; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT: v_or_b32_e32 v1, s0, v1
-; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; CI-NEXT: v_or_b32_e32 v4, s1, v1
+; CI-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; CI-NEXT: v_mov_b32_e32 v6, v2
+; CI-NEXT: v_or_b32_e32 v4, v0, v4
+; CI-NEXT: v_mov_b32_e32 v7, v3
+; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v8f16_3:
@@ -2301,12 +2360,19 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: s_mov_b32 s0, 0xffff
-; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_bfi_b32 v3, s0, v6, v3
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, s1, v3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2314,20 +2380,25 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
-; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
+; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v5
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
-; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: v_mov_b32_e32 v6, s4
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v6, s1
+; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_add_i32_e32 v5, vcc, s0, v5
+; CI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfi_b32 v3, s0, v6, v3
-; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v3, s1, v3
+; CI-NEXT: v_lshl_b64 v[3:4], v[3:4], 32
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_mov_b32_e32 v3, v4
+; CI-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v8i16_6:
@@ -2605,25 +2676,46 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
+; VI-NEXT: v_mov_b32_e32 v10, s1
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; VI-NEXT: s_lshl_b32 s1, s4, 16
-; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT: v_mov_b32_e32 v12, s1
-; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v8
+; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v16f16_3:
@@ -2733,22 +2825,23 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
+; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
+; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
+; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8
; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8
-; CI-NEXT: s_mov_b32 s2, 0xffff
-; CI-NEXT: v_mov_b32_e32 v12, s4
+; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_bfi_b32 v3, s2, v12, v3
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v3, s1, v3
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 526ee5a51745d3..65d93b313d672c 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -349,34 +349,38 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v4, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v2, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v3, 1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, 1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2
+; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16:
@@ -532,45 +536,45 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GFX67-SDAG-NEXT: v_bfe_u32 v7, v1, 0, 16
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v4
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v3, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v7, v4, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v3, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v6, v5, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v3
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
-; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6
+; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6
+; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v5, 1
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7
+; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
@@ -841,77 +845,80 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_imad_pat_v4i16:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3
+; GFX67-SDAG-NEXT: v_add_i32_e32 v11, vcc, 1, v3
+; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v2
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v11
+; GFX67-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v10, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v12, vcc, 1, v1
+; GFX67-SDAG-NEXT: v_lshr_b64 v[0:1], v[2:3], 48
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX67-SDAG-NEXT: v_bfe_u32 v13, v12, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v16, v0, v7
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v7, v11
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v6, v8
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v13, v5, v12
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GFX67-SDAG-NEXT: v_bfe_u32 v11, v0, 0, 16
+; GFX67-SDAG-NEXT: v_lshr_b64 v[0:1], v[2:3], 48
+; GFX67-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v10
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v7
; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v15, v13, v5
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v10, v14, v4, v10
; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v9, v14, v4, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v15
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v1, v4
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v11, v5
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
-; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6
-; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v6, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX67-SDAG-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v4, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v8, v6
+; GFX67-SDAG-NEXT: v_add_i32_e32 v9, vcc, s4, v9
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v8, v6, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX67-SDAG-NEXT: v_add_i32_e32 v7, vcc, s4, v7
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v9
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, s4, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v9
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, s4, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v0, v8
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v7, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v3
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v8
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1389,34 +1396,38 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v4, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v2, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v3, 1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, 1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2
+; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16:
@@ -1572,45 +1583,45 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GFX67-SDAG-NEXT: v_bfe_u32 v7, v1, 0, 16
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v4
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v3, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v7, v4, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v3, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v6, v5, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v3
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
-; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6
+; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6
+; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v5, 1
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7
+; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
@@ -1881,77 +1892,80 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_umad_pat_v4i16:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3
+; GFX67-SDAG-NEXT: v_add_i32_e32 v11, vcc, 1, v3
+; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v2
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v11
+; GFX67-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v10, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v12, vcc, 1, v1
+; GFX67-SDAG-NEXT: v_lshr_b64 v[0:1], v[2:3], 48
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX67-SDAG-NEXT: v_bfe_u32 v13, v12, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v16, v0, v7
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v7, v11
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v6, v8
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v13, v5, v12
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GFX67-SDAG-NEXT: v_bfe_u32 v11, v0, 0, 16
+; GFX67-SDAG-NEXT: v_lshr_b64 v[0:1], v[2:3], 48
+; GFX67-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v10
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v7
; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v15, v13, v5
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v10, v14, v4, v10
; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v9, v14, v4, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v15
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v1, v4
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v11, v5
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
-; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6
-; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v6, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX67-SDAG-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v4, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v8, v6
+; GFX67-SDAG-NEXT: v_add_i32_e32 v9, vcc, s4, v9
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v8, v6, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX67-SDAG-NEXT: v_add_i32_e32 v7, vcc, s4, v7
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v9
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, s4, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v9
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, s4, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v0, v8
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v7, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v3
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v8
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: clpeak_umad_pat_v4i16:
@@ -3841,22 +3855,24 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v1
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v4, 8, 8
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v0
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v5, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 8, 8
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v4, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v4, v2, 1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v6
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v0, v2
-; GFX67-SDAG-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 8, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2
+; GFX67-SDAG-NEXT: v_or_b32_e32 v3, v5, v3
; GFX67-SDAG-NEXT: s_movk_i32 s4, 0x100
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1
; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, s4, v3
@@ -3864,7 +3880,7 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v1
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0
; GFX67-SDAG-NEXT: v_bfe_u32 v2, v3, 8, 8
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 0x100, v0
@@ -3927,10 +3943,10 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX8-SDAG-NEXT: v_mad_u16 v3, v1, v4, v1
; GFX8-SDAG-NEXT: v_mad_u16 v2, v0, v5, v0
; GFX8-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3
-; GFX8-SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v1
+; GFX8-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2
-; GFX8-SDAG-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX8-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i8:
@@ -3966,10 +3982,10 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX9-SDAG-NEXT: v_mad_legacy_u16 v3, v1, v4, v1
; GFX9-SDAG-NEXT: v_mad_legacy_u16 v2, v0, v5, v0
; GFX9-SDAG-NEXT: v_mad_legacy_u16 v1, v3, v1, v3
-; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_mad_legacy_u16 v0, v2, v0, v2
-; GFX9-SDAG-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX9-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i8:
@@ -4006,9 +4022,9 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX10-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2
; GFX10-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1
; GFX10-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX10-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1
-; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-SDAG-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i8:
@@ -4052,11 +4068,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX11-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1
; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1
+; GFX11-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i8:
@@ -6972,58 +6988,62 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16_x2:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v4, v2, 1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v4, v3, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v5, v2, 1
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v3, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v4, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v4, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v5, v2
+; GFX67-SDAG-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v5, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v5, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v4, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v5, 1
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v4, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v3, v4
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v5, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v5, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v2, v4, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v3, v5, 1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v2, v5, 1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2
+; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
@@ -7280,58 +7300,62 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16_x2:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v4, v2, 1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v4, v3, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v5, v2, 1
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v3, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v4, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v4, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v5, v2
+; GFX67-SDAG-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v5, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v5, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v4, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v5, 1
+; GFX67-SDAG-NEXT: v_bfe_u32 v4, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v4, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v3, v4
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v5, v0
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v5, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v2, v4, 1
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v3, v5, 1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v2, v5, 1
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2
+; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2
-; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
@@ -7897,8 +7921,8 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v2, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v5, 0, 16
+; GFX67-SDAG-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX67-GISEL-LABEL: multi_use_mul_mad_v2i16_var:
@@ -8003,7 +8027,7 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v7, v8
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v3
; GFX67-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-SDAG-NEXT: ds_write_b32 v6, v7
; GFX67-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 69f181fcede30f..d947fe68413dec 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -2420,21 +2420,21 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @36, KC0[], KC1[]
; EG-NEXT: TEX 0 @20
-; EG-NEXT: ALU 5, @38, KC0[], KC1[]
+; EG-NEXT: ALU 7, @38, KC0[], KC1[]
; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 5, @44, KC0[], KC1[]
+; EG-NEXT: ALU 7, @46, KC0[], KC1[]
; EG-NEXT: TEX 0 @24
-; EG-NEXT: ALU 7, @50, KC0[], KC1[]
+; EG-NEXT: ALU 7, @54, KC0[], KC1[]
; EG-NEXT: TEX 0 @26
-; EG-NEXT: ALU 7, @58, KC0[], KC1[]
+; EG-NEXT: ALU 7, @62, KC0[], KC1[]
; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 7, @66, KC0[], KC1[]
+; EG-NEXT: ALU 7, @70, KC0[], KC1[]
; EG-NEXT: TEX 0 @30
-; EG-NEXT: ALU 7, @74, KC0[], KC1[]
+; EG-NEXT: ALU 7, @78, KC0[], KC1[]
; EG-NEXT: TEX 0 @32
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
+; EG-NEXT: ALU 5, @86, KC0[], KC1[]
; EG-NEXT: TEX 0 @34
-; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @92, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2458,20 +2458,24 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: LSHL T0.W, T6.X, literal.x,
+; EG-NEXT: AND_INT * T0.W, T6.X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 44:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 46:
+; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16777215(2.350989e-38)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 50:
+; EG-NEXT: ALU clause starting at 54:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
@@ -2480,7 +2484,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 58:
+; EG-NEXT: ALU clause starting at 62:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
@@ -2489,7 +2493,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 66:
+; EG-NEXT: ALU clause starting at 70:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
@@ -2498,7 +2502,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 74:
+; EG-NEXT: ALU clause starting at 78:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
@@ -2507,14 +2511,14 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 82:
+; EG-NEXT: ALU clause starting at 86:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T5.Y, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 88:
+; EG-NEXT: ALU clause starting at 92:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
@@ -2526,21 +2530,21 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @36, KC0[], KC1[]
; CM-NEXT: TEX 0 @20
-; CM-NEXT: ALU 5, @38, KC0[], KC1[]
+; CM-NEXT: ALU 7, @38, KC0[], KC1[]
; CM-NEXT: TEX 0 @22
-; CM-NEXT: ALU 5, @44, KC0[], KC1[]
+; CM-NEXT: ALU 7, @46, KC0[], KC1[]
; CM-NEXT: TEX 0 @24
-; CM-NEXT: ALU 7, @50, KC0[], KC1[]
+; CM-NEXT: ALU 7, @54, KC0[], KC1[]
; CM-NEXT: TEX 0 @26
-; CM-NEXT: ALU 7, @58, KC0[], KC1[]
+; CM-NEXT: ALU 7, @62, KC0[], KC1[]
; CM-NEXT: TEX 0 @28
-; CM-NEXT: ALU 7, @66, KC0[], KC1[]
+; CM-NEXT: ALU 7, @70, KC0[], KC1[]
; CM-NEXT: TEX 0 @30
-; CM-NEXT: ALU 7, @74, KC0[], KC1[]
+; CM-NEXT: ALU 7, @78, KC0[], KC1[]
; CM-NEXT: TEX 0 @32
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
+; CM-NEXT: ALU 5, @86, KC0[], KC1[]
; CM-NEXT: TEX 0 @34
-; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 5, @92, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -2564,20 +2568,24 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T5.X, 0.0,
; CM-NEXT: ALU clause starting at 38:
-; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
+; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 44:
+; CM-NEXT: ALU clause starting at 46:
+; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 50:
+; CM-NEXT: ALU clause starting at 54:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -2586,7 +2594,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 58:
+; CM-NEXT: ALU clause starting at 62:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -2595,7 +2603,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 66:
+; CM-NEXT: ALU clause starting at 70:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -2604,7 +2612,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 74:
+; CM-NEXT: ALU clause starting at 78:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -2613,14 +2621,14 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 82:
+; CM-NEXT: ALU clause starting at 86:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.Y,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 88:
+; CM-NEXT: ALU clause starting at 92:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T5.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
@@ -2678,21 +2686,21 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @36, KC0[], KC1[]
; EG-NEXT: TEX 0 @20
-; EG-NEXT: ALU 5, @38, KC0[], KC1[]
+; EG-NEXT: ALU 7, @38, KC0[], KC1[]
; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 5, @44, KC0[], KC1[]
+; EG-NEXT: ALU 7, @46, KC0[], KC1[]
; EG-NEXT: TEX 0 @24
-; EG-NEXT: ALU 5, @50, KC0[], KC1[]
+; EG-NEXT: ALU 5, @54, KC0[], KC1[]
; EG-NEXT: TEX 0 @26
-; EG-NEXT: ALU 5, @56, KC0[], KC1[]
+; EG-NEXT: ALU 5, @60, KC0[], KC1[]
; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 5, @62, KC0[], KC1[]
+; EG-NEXT: ALU 7, @66, KC0[], KC1[]
; EG-NEXT: TEX 0 @30
-; EG-NEXT: ALU 5, @68, KC0[], KC1[]
+; EG-NEXT: ALU 7, @74, KC0[], KC1[]
; EG-NEXT: TEX 0 @32
-; EG-NEXT: ALU 5, @74, KC0[], KC1[]
+; EG-NEXT: ALU 5, @82, KC0[], KC1[]
; EG-NEXT: TEX 0 @34
-; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @88, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2716,55 +2724,63 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: MOV * T7.X, 0.0,
; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: LSHL T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 44:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 46:
+; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 50:
+; EG-NEXT: ALU clause starting at 54:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 56:
+; EG-NEXT: ALU clause starting at 60:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 62:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 66:
+; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 68:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 74:
+; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 74:
+; EG-NEXT: ALU clause starting at 82:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 80:
+; EG-NEXT: ALU clause starting at 88:
; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
; EG-NEXT: AND_INT * T1.W, T7.X, literal.z,
@@ -2779,21 +2795,21 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @36, KC0[], KC1[]
; CM-NEXT: TEX 0 @20
-; CM-NEXT: ALU 5, @38, KC0[], KC1[]
+; CM-NEXT: ALU 7, @38, KC0[], KC1[]
; CM-NEXT: TEX 0 @22
-; CM-NEXT: ALU 5, @44, KC0[], KC1[]
+; CM-NEXT: ALU 7, @46, KC0[], KC1[]
; CM-NEXT: TEX 0 @24
-; CM-NEXT: ALU 5, @50, KC0[], KC1[]
+; CM-NEXT: ALU 5, @54, KC0[], KC1[]
; CM-NEXT: TEX 0 @26
-; CM-NEXT: ALU 5, @56, KC0[], KC1[]
+; CM-NEXT: ALU 5, @60, KC0[], KC1[]
; CM-NEXT: TEX 0 @28
-; CM-NEXT: ALU 5, @62, KC0[], KC1[]
+; CM-NEXT: ALU 7, @66, KC0[], KC1[]
; CM-NEXT: TEX 0 @30
-; CM-NEXT: ALU 5, @68, KC0[], KC1[]
+; CM-NEXT: ALU 7, @74, KC0[], KC1[]
; CM-NEXT: TEX 0 @32
-; CM-NEXT: ALU 5, @74, KC0[], KC1[]
+; CM-NEXT: ALU 5, @82, KC0[], KC1[]
; CM-NEXT: TEX 0 @34
-; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 8, @88, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -2817,55 +2833,63 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: MOV * T7.X, 0.0,
; CM-NEXT: ALU clause starting at 38:
-; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 44:
+; CM-NEXT: ALU clause starting at 46:
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 50:
+; CM-NEXT: ALU clause starting at 54:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 56:
+; CM-NEXT: ALU clause starting at 60:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 62:
+; CM-NEXT: ALU clause starting at 66:
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 68:
+; CM-NEXT: ALU clause starting at 74:
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 74:
+; CM-NEXT: ALU clause starting at 82:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 80:
+; CM-NEXT: ALU clause starting at 88:
; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
; CM-NEXT: AND_INT * T0.W, T7.X, literal.z,
@@ -3149,37 +3173,37 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @68, KC0[], KC1[]
; EG-NEXT: TEX 0 @36
-; EG-NEXT: ALU 5, @70, KC0[], KC1[]
+; EG-NEXT: ALU 7, @70, KC0[], KC1[]
; EG-NEXT: TEX 0 @38
-; EG-NEXT: ALU 5, @76, KC0[], KC1[]
+; EG-NEXT: ALU 7, @78, KC0[], KC1[]
; EG-NEXT: TEX 0 @40
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
+; EG-NEXT: ALU 7, @86, KC0[], KC1[]
; EG-NEXT: TEX 0 @42
-; EG-NEXT: ALU 5, @88, KC0[], KC1[]
-; EG-NEXT: TEX 0 @44
; EG-NEXT: ALU 7, @94, KC0[], KC1[]
-; EG-NEXT: TEX 0 @46
+; EG-NEXT: TEX 0 @44
; EG-NEXT: ALU 7, @102, KC0[], KC1[]
-; EG-NEXT: TEX 0 @48
+; EG-NEXT: TEX 0 @46
; EG-NEXT: ALU 7, @110, KC0[], KC1[]
-; EG-NEXT: TEX 0 @50
+; EG-NEXT: TEX 0 @48
; EG-NEXT: ALU 7, @118, KC0[], KC1[]
-; EG-NEXT: TEX 0 @52
+; EG-NEXT: TEX 0 @50
; EG-NEXT: ALU 7, @126, KC0[], KC1[]
-; EG-NEXT: TEX 0 @54
+; EG-NEXT: TEX 0 @52
; EG-NEXT: ALU 7, @134, KC0[], KC1[]
-; EG-NEXT: TEX 0 @56
+; EG-NEXT: TEX 0 @54
; EG-NEXT: ALU 7, @142, KC0[], KC1[]
-; EG-NEXT: TEX 0 @58
+; EG-NEXT: TEX 0 @56
; EG-NEXT: ALU 7, @150, KC0[], KC1[]
+; EG-NEXT: TEX 0 @58
+; EG-NEXT: ALU 7, @158, KC0[], KC1[]
; EG-NEXT: TEX 0 @60
-; EG-NEXT: ALU 5, @158, KC0[], KC1[]
+; EG-NEXT: ALU 5, @166, KC0[], KC1[]
; EG-NEXT: TEX 0 @62
-; EG-NEXT: ALU 5, @164, KC0[], KC1[]
+; EG-NEXT: ALU 5, @172, KC0[], KC1[]
; EG-NEXT: TEX 0 @64
-; EG-NEXT: ALU 5, @170, KC0[], KC1[]
+; EG-NEXT: ALU 5, @178, KC0[], KC1[]
; EG-NEXT: TEX 0 @66
-; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @184, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -3219,34 +3243,42 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T7.X, 0.0,
; EG-NEXT: ALU clause starting at 70:
-; EG-NEXT: LSHL T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 76:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 78:
+; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16777215(2.350989e-38)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 82:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 86:
+; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16777215(2.350989e-38)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 94:
+; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16777215(2.350989e-38)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 94:
+; EG-NEXT: ALU clause starting at 102:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
@@ -3255,7 +3287,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 102:
+; EG-NEXT: ALU clause starting at 110:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
@@ -3264,7 +3296,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 110:
+; EG-NEXT: ALU clause starting at 118:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
@@ -3273,7 +3305,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 118:
+; EG-NEXT: ALU clause starting at 126:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
@@ -3282,7 +3314,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 126:
+; EG-NEXT: ALU clause starting at 134:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
@@ -3291,7 +3323,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 134:
+; EG-NEXT: ALU clause starting at 142:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
@@ -3300,7 +3332,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 142:
+; EG-NEXT: ALU clause starting at 150:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
@@ -3309,7 +3341,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 150:
+; EG-NEXT: ALU clause starting at 158:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
@@ -3318,28 +3350,28 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 158:
+; EG-NEXT: ALU clause starting at 166:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T7.W, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 164:
+; EG-NEXT: ALU clause starting at 172:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 170:
+; EG-NEXT: ALU clause starting at 178:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T7.Y, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 176:
+; EG-NEXT: ALU clause starting at 184:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T7.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
@@ -3351,37 +3383,37 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @68, KC0[], KC1[]
; CM-NEXT: TEX 0 @36
-; CM-NEXT: ALU 5, @70, KC0[], KC1[]
+; CM-NEXT: ALU 7, @70, KC0[], KC1[]
; CM-NEXT: TEX 0 @38
-; CM-NEXT: ALU 5, @76, KC0[], KC1[]
+; CM-NEXT: ALU 7, @78, KC0[], KC1[]
; CM-NEXT: TEX 0 @40
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
+; CM-NEXT: ALU 7, @86, KC0[], KC1[]
; CM-NEXT: TEX 0 @42
-; CM-NEXT: ALU 5, @88, KC0[], KC1[]
-; CM-NEXT: TEX 0 @44
; CM-NEXT: ALU 7, @94, KC0[], KC1[]
-; CM-NEXT: TEX 0 @46
+; CM-NEXT: TEX 0 @44
; CM-NEXT: ALU 7, @102, KC0[], KC1[]
-; CM-NEXT: TEX 0 @48
+; CM-NEXT: TEX 0 @46
; CM-NEXT: ALU 7, @110, KC0[], KC1[]
-; CM-NEXT: TEX 0 @50
+; CM-NEXT: TEX 0 @48
; CM-NEXT: ALU 7, @118, KC0[], KC1[]
-; CM-NEXT: TEX 0 @52
+; CM-NEXT: TEX 0 @50
; CM-NEXT: ALU 7, @126, KC0[], KC1[]
-; CM-NEXT: TEX 0 @54
+; CM-NEXT: TEX 0 @52
; CM-NEXT: ALU 7, @134, KC0[], KC1[]
-; CM-NEXT: TEX 0 @56
+; CM-NEXT: TEX 0 @54
; CM-NEXT: ALU 7, @142, KC0[], KC1[]
-; CM-NEXT: TEX 0 @58
+; CM-NEXT: TEX 0 @56
; CM-NEXT: ALU 7, @150, KC0[], KC1[]
+; CM-NEXT: TEX 0 @58
+; CM-NEXT: ALU 7, @158, KC0[], KC1[]
; CM-NEXT: TEX 0 @60
-; CM-NEXT: ALU 5, @158, KC0[], KC1[]
+; CM-NEXT: ALU 5, @166, KC0[], KC1[]
; CM-NEXT: TEX 0 @62
-; CM-NEXT: ALU 5, @164, KC0[], KC1[]
+; CM-NEXT: ALU 5, @172, KC0[], KC1[]
; CM-NEXT: TEX 0 @64
-; CM-NEXT: ALU 5, @170, KC0[], KC1[]
+; CM-NEXT: ALU 5, @178, KC0[], KC1[]
; CM-NEXT: TEX 0 @66
-; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 5, @184, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -3421,34 +3453,42 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T7.X, 0.0,
; CM-NEXT: ALU clause starting at 70:
-; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 76:
+; CM-NEXT: ALU clause starting at 78:
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 82:
+; CM-NEXT: ALU clause starting at 86:
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 88:
+; CM-NEXT: ALU clause starting at 94:
+; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 94:
+; CM-NEXT: ALU clause starting at 102:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3457,7 +3497,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 102:
+; CM-NEXT: ALU clause starting at 110:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3466,7 +3506,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 110:
+; CM-NEXT: ALU clause starting at 118:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3475,7 +3515,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 118:
+; CM-NEXT: ALU clause starting at 126:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3484,7 +3524,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 126:
+; CM-NEXT: ALU clause starting at 134:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3493,7 +3533,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 134:
+; CM-NEXT: ALU clause starting at 142:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3502,7 +3542,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 142:
+; CM-NEXT: ALU clause starting at 150:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3511,7 +3551,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 150:
+; CM-NEXT: ALU clause starting at 158:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
@@ -3520,28 +3560,28 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 158:
+; CM-NEXT: ALU clause starting at 166:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 164:
+; CM-NEXT: ALU clause starting at 172:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 170:
+; CM-NEXT: ALU clause starting at 178:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.Y,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 176:
+; CM-NEXT: ALU clause starting at 184:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T7.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
@@ -3620,37 +3660,37 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @68, KC0[], KC1[]
; EG-NEXT: TEX 0 @36
-; EG-NEXT: ALU 5, @70, KC0[], KC1[]
+; EG-NEXT: ALU 7, @70, KC0[], KC1[]
; EG-NEXT: TEX 0 @38
-; EG-NEXT: ALU 5, @76, KC0[], KC1[]
+; EG-NEXT: ALU 7, @78, KC0[], KC1[]
; EG-NEXT: TEX 0 @40
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
+; EG-NEXT: ALU 7, @86, KC0[], KC1[]
; EG-NEXT: TEX 0 @42
-; EG-NEXT: ALU 5, @88, KC0[], KC1[]
+; EG-NEXT: ALU 7, @94, KC0[], KC1[]
; EG-NEXT: TEX 0 @44
-; EG-NEXT: ALU 5, @94, KC0[], KC1[]
+; EG-NEXT: ALU 5, @102, KC0[], KC1[]
; EG-NEXT: TEX 0 @46
-; EG-NEXT: ALU 5, @100, KC0[], KC1[]
+; EG-NEXT: ALU 5, @108, KC0[], KC1[]
; EG-NEXT: TEX 0 @48
-; EG-NEXT: ALU 5, @106, KC0[], KC1[]
+; EG-NEXT: ALU 5, @114, KC0[], KC1[]
; EG-NEXT: TEX 0 @50
-; EG-NEXT: ALU 5, @112, KC0[], KC1[]
+; EG-NEXT: ALU 5, @120, KC0[], KC1[]
; EG-NEXT: TEX 0 @52
-; EG-NEXT: ALU 5, @118, KC0[], KC1[]
+; EG-NEXT: ALU 7, @126, KC0[], KC1[]
; EG-NEXT: TEX 0 @54
-; EG-NEXT: ALU 5, @124, KC0[], KC1[]
+; EG-NEXT: ALU 7, @134, KC0[], KC1[]
; EG-NEXT: TEX 0 @56
-; EG-NEXT: ALU 5, @130, KC0[], KC1[]
+; EG-NEXT: ALU 7, @142, KC0[], KC1[]
; EG-NEXT: TEX 0 @58
-; EG-NEXT: ALU 5, @136, KC0[], KC1[]
+; EG-NEXT: ALU 7, @150, KC0[], KC1[]
; EG-NEXT: TEX 0 @60
-; EG-NEXT: ALU 5, @142, KC0[], KC1[]
+; EG-NEXT: ALU 5, @158, KC0[], KC1[]
; EG-NEXT: TEX 0 @62
-; EG-NEXT: ALU 5, @148, KC0[], KC1[]
+; EG-NEXT: ALU 5, @164, KC0[], KC1[]
; EG-NEXT: TEX 0 @64
-; EG-NEXT: ALU 5, @154, KC0[], KC1[]
+; EG-NEXT: ALU 5, @170, KC0[], KC1[]
; EG-NEXT: TEX 0 @66
-; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 13, @176, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
@@ -3690,111 +3730,127 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: MOV * T11.X, 0.0,
; EG-NEXT: ALU clause starting at 70:
-; EG-NEXT: LSHL T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 76:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 78:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T7.X,
-; EG-NEXT: ALU clause starting at 82:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 86:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T7.X, PV.W,
; EG-NEXT: MOV * T0.Y, T9.X,
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 94:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 94:
+; EG-NEXT: ALU clause starting at 102:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 100:
+; EG-NEXT: ALU clause starting at 108:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T7.X,
-; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: ALU clause starting at 114:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T7.X, PV.W,
; EG-NEXT: MOV * T0.Y, T9.X,
-; EG-NEXT: ALU clause starting at 112:
+; EG-NEXT: ALU clause starting at 120:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 118:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 126:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 124:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 134:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: ALU clause starting at 130:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 142:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T6.X, PV.W,
; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: ALU clause starting at 136:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 150:
+; EG-NEXT: AND_INT T0.W, T12.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 142:
+; EG-NEXT: ALU clause starting at 158:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 148:
+; EG-NEXT: ALU clause starting at 164:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T12.X, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.X,
; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: ALU clause starting at 154:
+; EG-NEXT: ALU clause starting at 170:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T13.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
; EG-NEXT: MOV T6.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: ALU clause starting at 160:
+; EG-NEXT: ALU clause starting at 176:
; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -3814,37 +3870,37 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @68, KC0[], KC1[]
; CM-NEXT: TEX 0 @36
-; CM-NEXT: ALU 5, @70, KC0[], KC1[]
+; CM-NEXT: ALU 7, @70, KC0[], KC1[]
; CM-NEXT: TEX 0 @38
-; CM-NEXT: ALU 5, @76, KC0[], KC1[]
+; CM-NEXT: ALU 7, @78, KC0[], KC1[]
; CM-NEXT: TEX 0 @40
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
+; CM-NEXT: ALU 7, @86, KC0[], KC1[]
; CM-NEXT: TEX 0 @42
-; CM-NEXT: ALU 5, @88, KC0[], KC1[]
+; CM-NEXT: ALU 7, @94, KC0[], KC1[]
; CM-NEXT: TEX 0 @44
-; CM-NEXT: ALU 5, @94, KC0[], KC1[]
+; CM-NEXT: ALU 5, @102, KC0[], KC1[]
; CM-NEXT: TEX 0 @46
-; CM-NEXT: ALU 5, @100, KC0[], KC1[]
+; CM-NEXT: ALU 5, @108, KC0[], KC1[]
; CM-NEXT: TEX 0 @48
-; CM-NEXT: ALU 5, @106, KC0[], KC1[]
+; CM-NEXT: ALU 5, @114, KC0[], KC1[]
; CM-NEXT: TEX 0 @50
-; CM-NEXT: ALU 5, @112, KC0[], KC1[]
+; CM-NEXT: ALU 5, @120, KC0[], KC1[]
; CM-NEXT: TEX 0 @52
-; CM-NEXT: ALU 5, @118, KC0[], KC1[]
+; CM-NEXT: ALU 7, @126, KC0[], KC1[]
; CM-NEXT: TEX 0 @54
-; CM-NEXT: ALU 5, @124, KC0[], KC1[]
+; CM-NEXT: ALU 7, @134, KC0[], KC1[]
; CM-NEXT: TEX 0 @56
-; CM-NEXT: ALU 5, @130, KC0[], KC1[]
+; CM-NEXT: ALU 7, @142, KC0[], KC1[]
; CM-NEXT: TEX 0 @58
-; CM-NEXT: ALU 5, @136, KC0[], KC1[]
+; CM-NEXT: ALU 7, @150, KC0[], KC1[]
; CM-NEXT: TEX 0 @60
-; CM-NEXT: ALU 5, @142, KC0[], KC1[]
+; CM-NEXT: ALU 5, @158, KC0[], KC1[]
; CM-NEXT: TEX 0 @62
-; CM-NEXT: ALU 5, @148, KC0[], KC1[]
+; CM-NEXT: ALU 5, @164, KC0[], KC1[]
; CM-NEXT: TEX 0 @64
-; CM-NEXT: ALU 5, @154, KC0[], KC1[]
+; CM-NEXT: ALU 5, @170, KC0[], KC1[]
; CM-NEXT: TEX 0 @66
-; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 14, @176, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
; CM-NEXT: CF_END
@@ -3884,111 +3940,127 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: MOV * T11.X, 0.0,
; CM-NEXT: ALU clause starting at 70:
-; CM-NEXT: LSHL T0.Z, T12.X, literal.x,
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 76:
+; CM-NEXT: ALU clause starting at 78:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T7.X,
-; CM-NEXT: ALU clause starting at 82:
+; CM-NEXT: ALU clause starting at 86:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T7.X, PV.W,
; CM-NEXT: MOV * T0.Y, T9.X,
-; CM-NEXT: ALU clause starting at 88:
+; CM-NEXT: ALU clause starting at 94:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T9.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 94:
+; CM-NEXT: ALU clause starting at 102:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 100:
+; CM-NEXT: ALU clause starting at 108:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T7.X,
-; CM-NEXT: ALU clause starting at 106:
+; CM-NEXT: ALU clause starting at 114:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T7.X, PV.W,
; CM-NEXT: MOV * T0.Y, T9.X,
-; CM-NEXT: ALU clause starting at 112:
+; CM-NEXT: ALU clause starting at 120:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T9.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 118:
+; CM-NEXT: ALU clause starting at 126:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 124:
+; CM-NEXT: ALU clause starting at 134:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T6.X,
-; CM-NEXT: ALU clause starting at 130:
+; CM-NEXT: ALU clause starting at 142:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T6.X, PV.W,
; CM-NEXT: MOV * T0.Y, T8.X,
-; CM-NEXT: ALU clause starting at 136:
+; CM-NEXT: ALU clause starting at 150:
+; CM-NEXT: AND_INT * T0.W, T12.X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T8.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 142:
+; CM-NEXT: ALU clause starting at 158:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 148:
+; CM-NEXT: ALU clause starting at 164:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.X,
; CM-NEXT: MOV * T0.Y, T6.X,
-; CM-NEXT: ALU clause starting at 154:
+; CM-NEXT: ALU clause starting at 170:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T13.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W,
; CM-NEXT: MOV T6.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T8.X,
-; CM-NEXT: ALU clause starting at 160:
+; CM-NEXT: ALU clause starting at 176:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index 8a9d731334ec5f..fbd4fc4dd98224 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -293,7 +293,7 @@ define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15>
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0x7fff
-; GCN-NEXT: s_bfe_u32 s2, s2, 0x100010
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
; GCN-NEXT: s_lshl_b32 s2, s2, 15
; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: s_andn2_b32 s2, s2, -2.0
@@ -319,9 +319,9 @@ define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15>
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, s3, 0xffff
; GCN-NEXT: s_and_b32 s5, s2, 0x7fff
-; GCN-NEXT: s_lshr_b32 s6, s2, 1
+; GCN-NEXT: s_bfe_u32 s6, s2, 0xf0010
; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 30
-; GCN-NEXT: s_and_b32 s4, s6, 0x3fff8000
+; GCN-NEXT: s_lshl_b32 s4, s6, 15
; GCN-NEXT: s_and_b32 s6, s3, 0x1fff
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_mov_b32 s5, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 920ff8a927e2d1..8da6c4c0e0ccab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -16,6 +16,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -73,6 +77,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -126,9 +134,23 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
}
define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 {
-; GCN-LABEL: s_cvt_pkrtz_undef_undef:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_endpgm
+; SI-LABEL: s_cvt_pkrtz_undef_undef:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_cvt_pkrtz_undef_undef:
+; VI: ; %bb.0:
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_cvt_pkrtz_undef_undef:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_cvt_pkrtz_undef_undef:
; GFX10: ; %bb.0:
@@ -160,6 +182,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -257,6 +283,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -335,6 +365,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -417,6 +451,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -519,6 +557,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -621,6 +663,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -724,6 +770,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -817,3 +867,5 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 5d243e3a5890a1..776798941646ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -93,8 +93,9 @@ define void @ds_bpermute_or_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v2, 62, v2
+; CHECK-NEXT: v_or_b32_e32 v2, 1, v2
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; CHECK-NEXT: ds_bpermute_b32 v2, v2, v3 offset:4
+; CHECK-NEXT: ds_bpermute_b32 v2, v2, v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 0c6bba2426947e..b95bfbe69d4389 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -1118,14 +1118,6 @@ main_body:
}
define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_buffer_load_v2f16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b32 v0, v1
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_buffer_load_v2f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
@@ -1153,14 +1145,6 @@ main_body:
}
define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_buffer_load_v4f16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_buffer_load_v4f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
@@ -1188,14 +1172,6 @@ main_body:
}
define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_buffer_load_v2i16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b32 v0, v1
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_buffer_load_v2i16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
@@ -1223,14 +1199,6 @@ main_body:
}
define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_buffer_load_v4i16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_buffer_load_v4i16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 04d221325a5912..efaec8dd7ae5a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -915,14 +915,6 @@ main_body:
}
define amdgpu_ps void @raw_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_ptr_buffer_load_v2f16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b32 v0, v1
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_ptr_buffer_load_v2f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
@@ -943,14 +935,6 @@ main_body:
}
define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_ptr_buffer_load_v4f16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_ptr_buffer_load_v4f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
@@ -971,14 +955,6 @@ main_body:
}
define amdgpu_ps void @raw_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_ptr_buffer_load_v2i16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b32 v0, v1
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_ptr_buffer_load_v2i16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
@@ -999,14 +975,6 @@ main_body:
}
define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
-; PREGFX10-LABEL: raw_ptr_buffer_load_v4i16:
-; PREGFX10: ; %bb.0: ; %main_body
-; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
-; PREGFX10-NEXT: s_mov_b32 m0, -1
-; PREGFX10-NEXT: s_waitcnt vmcnt(0)
-; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
-; PREGFX10-NEXT: s_endpgm
-;
; GFX10-LABEL: raw_ptr_buffer_load_v4i16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 94fbd0137a509e..3fc42e7558b6de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -339,13 +339,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_buffer_load_v2f16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v1
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_buffer_load_v2f16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b32 v0, v1
+; VI-NEXT: s_endpgm
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x half> %val, ptr addrspace(3) %ptr
@@ -358,13 +358,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_buffer_load_v4f16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[1:2]
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_buffer_load_v4f16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b64 v0, v[1:2]
+; VI-NEXT: s_endpgm
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
@@ -396,13 +396,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v1
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_buffer_load_v2i16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b32 v0, v1
+; VI-NEXT: s_endpgm
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x i16> %val, ptr addrspace(3) %ptr
@@ -415,13 +415,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_buffer_load_v4i16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[1:2]
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_buffer_load_v4i16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b64 v0, v[1:2]
+; VI-NEXT: s_endpgm
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
index 71adf4b2aaeab6..01a5d759e1aa1f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
@@ -339,13 +339,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_ptr_buffer_load_v2f16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v1
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_ptr_buffer_load_v2f16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b32 v0, v1
+; VI-NEXT: s_endpgm
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x half> %val, ptr addrspace(3) %ptr
@@ -358,13 +358,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_ptr_buffer_load_v4f16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[1:2]
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_ptr_buffer_load_v4f16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b64 v0, v[1:2]
+; VI-NEXT: s_endpgm
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
@@ -396,13 +396,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_ptr_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v1
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_ptr_buffer_load_v2i16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b32 v0, v1
+; VI-NEXT: s_endpgm
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x i16> %val, ptr addrspace(3) %ptr
@@ -415,13 +415,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
-; CHECK-LABEL: struct_ptr_buffer_load_v4i16:
-; CHECK: ; %bb.0: ; %main_body
-; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
-; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[1:2]
-; CHECK-NEXT: s_endpgm
+; VI-LABEL: struct_ptr_buffer_load_v4i16:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ds_write_b64 v0, v[1:2]
+; VI-NEXT: s_endpgm
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 0755dcddd8f46e..6c962926b8283e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -691,8 +691,8 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
-; SI-NEXT: v_and_b32_e32 v0, 2.0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -710,8 +710,8 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
-; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
@@ -779,7 +779,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 1, v0
+; SI-NEXT: v_bfe_u32 v0, v0, 0, 1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -797,7 +797,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 1, v0
+; VI-NEXT: v_bfe_u32 v0, v0, 0, 1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 7a0450761e1f11..8eb17dde9204c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6435,18 +6435,21 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-LABEL: v_exp_fabs_v2f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp_fabs_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 544c1de6c7bb77..f0a15d8431f5a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6528,18 +6528,21 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-LABEL: v_exp10_fabs_v2f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp10_fabs_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 36e78975cdb015..6aefd4dbc2a00d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -2363,8 +2363,11 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-LABEL: v_exp2_fabs_v2f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index d847af780acab3..00bd42b715ab29 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6696,18 +6696,21 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-LABEL: v_log_fabs_v2f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log_fabs_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 3f060de9f6596d..4f01c5d40f0943 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6696,18 +6696,21 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-LABEL: v_log10_fabs_v2f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log10_fabs_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 035b2439eff153..76389c92ee43a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -3141,8 +3141,11 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-LABEL: v_log2_fabs_v2f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index c3e665fa8269a0..6be982ae3131f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -23,8 +23,10 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5
; SI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
-; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; SI-NEXT: v_add_i32_e32 v5, vcc, v2, v3
+; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -44,10 +46,12 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v7
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v8, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
; GFX9-NEXT: v_add3_u32 v1, v1, v5, v7
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -67,8 +71,10 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v4
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -92,9 +98,12 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v4
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -122,9 +131,12 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 32, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_or_b32_e32 v2, v2, v4
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -163,8 +175,10 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
; SI-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v0, v4
@@ -196,10 +210,12 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v5
; GFX9-NEXT: v_subbrev_co_u32_e32 v7, vcc, 0, v4, vcc
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
; GFX9-NEXT: v_add3_u32 v1, v1, v6, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
; GFX9-NEXT: v_mov_b32_e32 v5, v4
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -226,16 +242,18 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
-; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, v4
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v5, v4
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -265,18 +283,21 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v2, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
+; GFX11-NEXT: v_dual_cndmask_b32 v9, v2, v4 :: v_dual_cndmask_b32 v2, v5, v7
+; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -310,18 +331,21 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
-; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v2, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
+; GFX12-NEXT: v_dual_cndmask_b32 v9, v2, v4 :: v_dual_cndmask_b32 v2, v5, v7
+; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 32, v[2:3]
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -351,9 +375,11 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s1, v1
-; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; SI-NEXT: v_add_i32_e32 v4, vcc, s1, v1
+; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v3
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT: s_and_b64 s[0:1], vcc, exec
@@ -379,8 +405,10 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX9-NEXT: s_addc_u32 s4, s5, s4
; GFX9-NEXT: s_addc_u32 s5, s10, 0
; GFX9-NEXT: s_mul_i32 s1, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s1
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s1, s4, s1
+; GFX9-NEXT: s_addc_u32 s4, 0, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GFX9-NEXT: s_or_b32 s4, s4, s1
; GFX9-NEXT: s_add_i32 s1, s8, s7
; GFX9-NEXT: s_add_i32 s1, s1, s6
; GFX9-NEXT: s_mul_i32 s0, s0, s2
@@ -398,24 +426,26 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s7, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX10-NEXT: s_mul_hi_u32 s6, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
+; GFX10-NEXT: s_mul_i32 s5, s1, s2
; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3
; GFX10-NEXT: s_mul_i32 s1, s1, s3
; GFX10-NEXT: s_add_u32 s3, s8, s7
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_u32 s3, s3, s6
-; GFX10-NEXT: s_addc_u32 s3, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s3, s1
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_addc_u32 s6, 0, s6
+; GFX10-NEXT: s_add_u32 s3, s3, s5
+; GFX10-NEXT: s_addc_u32 s3, s6, s4
+; GFX10-NEXT: s_addc_u32 s4, s9, 0
+; GFX10-NEXT: s_add_u32 s3, s3, s1
+; GFX10-NEXT: s_addc_u32 s4, 0, s4
+; GFX10-NEXT: s_mul_i32 s2, s0, s2
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[4:5], 32
+; GFX10-NEXT: s_add_i32 s4, s8, s7
+; GFX10-NEXT: s_or_b32 s0, s0, s3
+; GFX10-NEXT: s_add_i32 s4, s4, s5
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10-NEXT: s_cselect_b32 s0, 0, s2
+; GFX10-NEXT: s_cselect_b32 s1, 0, s4
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -427,24 +457,26 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_i32 s7, s0, s3
; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX11-NEXT: s_mul_hi_u32 s6, s0, s3
; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
+; GFX11-NEXT: s_mul_i32 s5, s1, s2
; GFX11-NEXT: s_mul_hi_u32 s9, s1, s3
; GFX11-NEXT: s_mul_i32 s1, s1, s3
; GFX11-NEXT: s_add_u32 s3, s8, s7
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_u32 s3, s3, s6
-; GFX11-NEXT: s_addc_u32 s3, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s3, s1
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_i32 s1, s8, s7
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_addc_u32 s6, 0, s6
+; GFX11-NEXT: s_add_u32 s3, s3, s5
+; GFX11-NEXT: s_addc_u32 s3, s6, s4
+; GFX11-NEXT: s_addc_u32 s4, s9, 0
+; GFX11-NEXT: s_add_u32 s3, s3, s1
+; GFX11-NEXT: s_addc_u32 s4, 0, s4
+; GFX11-NEXT: s_mul_i32 s2, s0, s2
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[4:5], 32
+; GFX11-NEXT: s_add_i32 s4, s8, s7
+; GFX11-NEXT: s_or_b32 s0, s0, s3
+; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, 0, s2
+; GFX11-NEXT: s_cselect_b32 s1, 0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -470,10 +502,15 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[8:9]
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s6, s5
+; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 32
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s6, s6, s4
+; GFX12-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX12-NEXT: s_cselect_b32 s0, 0, s0
; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_nop 0
@@ -527,8 +564,10 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-NEXT: s_sub_u32 s0, s2, s0
; SI-NEXT: s_subb_u32 s4, s1, 0
; SI-NEXT: s_cmp_lt_i32 s3, 0
-; SI-NEXT: s_cselect_b32 s1, s4, s1
-; SI-NEXT: s_cselect_b32 s0, s0, s2
+; SI-NEXT: s_cselect_b32 s2, s0, s2
+; SI-NEXT: s_cselect_b32 s0, s4, s1
+; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT: s_and_b64 s[0:1], vcc, exec
@@ -561,14 +600,16 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX9-NEXT: s_cmp_lt_i32 s1, 0
; GFX9-NEXT: s_cselect_b32 s4, s9, s4
; GFX9-NEXT: s_cselect_b32 s1, s10, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s0
-; GFX9-NEXT: s_subb_u32 s5, s1, 0
+; GFX9-NEXT: s_sub_u32 s5, s4, s0
+; GFX9-NEXT: s_subb_u32 s9, s1, 0
; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s5, s5, s1
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
+; GFX9-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-NEXT: s_cselect_b32 s4, s9, s1
; GFX9-NEXT: s_add_i32 s1, s8, s7
; GFX9-NEXT: s_add_i32 s1, s1, s6
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
; GFX9-NEXT: s_ashr_i32 s6, s1, 31
+; GFX9-NEXT: s_or_b32 s4, s4, s3
; GFX9-NEXT: s_mov_b32 s7, s6
; GFX9-NEXT: s_mul_i32 s0, s0, s2
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
@@ -585,36 +626,38 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s7, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
+; GFX10-NEXT: s_mul_hi_u32 s6, s0, s3
+; GFX10-NEXT: s_mul_i32 s5, s1, s2
; GFX10-NEXT: s_add_u32 s11, s8, s7
; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
+; GFX10-NEXT: s_addc_u32 s6, 0, s6
; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT: s_add_u32 s11, s11, s6
+; GFX10-NEXT: s_add_u32 s11, s11, s5
; GFX10-NEXT: s_mul_i32 s10, s1, s3
-; GFX10-NEXT: s_addc_u32 s4, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
+; GFX10-NEXT: s_addc_u32 s4, s6, s4
+; GFX10-NEXT: s_addc_u32 s6, s9, 0
; GFX10-NEXT: s_add_u32 s4, s4, s10
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
+; GFX10-NEXT: s_addc_u32 s6, 0, s6
; GFX10-NEXT: s_sub_u32 s9, s4, s2
-; GFX10-NEXT: s_subb_u32 s10, s5, 0
+; GFX10-NEXT: s_subb_u32 s10, s6, 0
; GFX10-NEXT: s_cmp_lt_i32 s1, 0
; GFX10-NEXT: s_cselect_b32 s1, s9, s4
-; GFX10-NEXT: s_cselect_b32 s4, s10, s5
-; GFX10-NEXT: s_sub_u32 s9, s1, s0
-; GFX10-NEXT: s_subb_u32 s5, s4, 0
+; GFX10-NEXT: s_cselect_b32 s4, s10, s6
+; GFX10-NEXT: s_sub_u32 s6, s1, s0
+; GFX10-NEXT: s_subb_u32 s9, s4, 0
; GFX10-NEXT: s_cmp_lt_i32 s3, 0
; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s9, s1
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_ashr_i32 s6, s1, 31
+; GFX10-NEXT: s_cselect_b32 s1, s6, s1
+; GFX10-NEXT: s_cselect_b32 s4, s9, s4
+; GFX10-NEXT: s_add_i32 s3, s8, s7
+; GFX10-NEXT: s_add_i32 s3, s3, s5
+; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GFX10-NEXT: s_ashr_i32 s6, s3, 31
+; GFX10-NEXT: s_or_b32 s4, s4, s1
; GFX10-NEXT: s_mov_b32 s7, s6
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_cselect_b32 s1, 0, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -626,38 +669,39 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_i32 s7, s0, s3
; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
+; GFX11-NEXT: s_mul_hi_u32 s6, s0, s3
+; GFX11-NEXT: s_mul_i32 s5, s1, s2
; GFX11-NEXT: s_add_u32 s11, s8, s7
; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
+; GFX11-NEXT: s_addc_u32 s6, 0, s6
; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT: s_add_u32 s11, s11, s6
+; GFX11-NEXT: s_add_u32 s11, s11, s5
; GFX11-NEXT: s_mul_i32 s10, s1, s3
-; GFX11-NEXT: s_addc_u32 s4, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
+; GFX11-NEXT: s_addc_u32 s4, s6, s4
+; GFX11-NEXT: s_addc_u32 s6, s9, 0
; GFX11-NEXT: s_add_u32 s4, s4, s10
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
+; GFX11-NEXT: s_addc_u32 s6, 0, s6
; GFX11-NEXT: s_sub_u32 s9, s4, s2
-; GFX11-NEXT: s_subb_u32 s10, s5, 0
+; GFX11-NEXT: s_subb_u32 s10, s6, 0
; GFX11-NEXT: s_cmp_lt_i32 s1, 0
; GFX11-NEXT: s_cselect_b32 s1, s9, s4
-; GFX11-NEXT: s_cselect_b32 s4, s10, s5
-; GFX11-NEXT: s_sub_u32 s9, s1, s0
-; GFX11-NEXT: s_subb_u32 s5, s4, 0
+; GFX11-NEXT: s_cselect_b32 s4, s10, s6
+; GFX11-NEXT: s_sub_u32 s6, s1, s0
+; GFX11-NEXT: s_subb_u32 s9, s4, 0
; GFX11-NEXT: s_cmp_lt_i32 s3, 0
; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_cselect_b32 s5, s5, s4
-; GFX11-NEXT: s_cselect_b32 s4, s9, s1
-; GFX11-NEXT: s_add_i32 s1, s8, s7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_ashr_i32 s6, s1, 31
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cselect_b32 s1, s6, s1
+; GFX11-NEXT: s_cselect_b32 s4, s9, s4
+; GFX11-NEXT: s_add_i32 s3, s8, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s3, s3, s5
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GFX11-NEXT: s_ashr_i32 s6, s3, 31
+; GFX11-NEXT: s_or_b32 s4, s4, s1
; GFX11-NEXT: s_mov_b32 s7, s6
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_cselect_b32 s1, 0, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -692,15 +736,16 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX12-NEXT: s_cmp_lt_i32 s3, 0
; GFX12-NEXT: s_sub_nc_u64 s[4:5], s[6:7], s[4:5]
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_cselect_b32 s3, s5, s7
-; GFX12-NEXT: s_cselect_b32 s2, s4, s6
+; GFX12-NEXT: s_cselect_b32 s2, s5, s7
+; GFX12-NEXT: s_cselect_b32 s6, s4, s6
; GFX12-NEXT: s_ashr_i32 s4, s1, 31
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b64 s[2:3], s[2:3], 32
; GFX12-NEXT: s_mov_b32 s5, s4
+; GFX12-NEXT: s_or_b32 s2, s2, s6
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], s[4:5]
; GFX12-NEXT: s_cselect_b32 s0, 0, s0
; GFX12-NEXT: s_cselect_b32 s1, 0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_nop 0
@@ -790,37 +835,34 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: v_lshl_b64 v[4:5], v[0:1], 2
-; SI-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
+; SI-NEXT: v_lshl_b64 v[5:6], v[0:1], 2
+; SI-NEXT: v_alignbit_b32 v4, v1, v0, 30
+; SI-NEXT: v_lshr_b64 v[2:3], v[5:6], 2
+; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v4
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v0, v5
+; SI-NEXT: v_mov_b32_e32 v1, v4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; GFX9-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[5:6], 2, v[4:5]
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_4:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[5:6], 2, v[4:5]
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@@ -829,11 +871,12 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX11-LABEL: umulo_i64_v_4:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b64 v[5:6], 2, v[4:5]
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -845,11 +888,12 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[0:1]
; GFX12-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b64 v[5:6], 2, v[4:5]
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index f9ff7609755a93..2de46195304f1e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -377,11 +377,12 @@ entry:
define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_x_known_bits:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dword s2, s[0:1], 0x6
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_u32 s4, s2, 0x100000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -393,20 +394,23 @@ define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_u32 s4, s4, 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: local_size_x_known_bits:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.X, KC0[1].Z, literal.y,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T0.W, KC0[1].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
@@ -424,11 +428,12 @@ entry:
define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_y_known_bits:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dword s2, s[0:1], 0x7
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_u32 s4, s2, 0x100000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -440,20 +445,23 @@ define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_u32 s4, s4, 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: local_size_y_known_bits:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.X, KC0[1].W, literal.y,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T0.W, KC0[1].W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
@@ -471,11 +479,12 @@ entry:
define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_z_known_bits:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dword s2, s[0:1], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_u32 s4, s2, 0x100000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -487,20 +496,23 @@ define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_u32 s4, s4, 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: local_size_z_known_bits:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.X, KC0[2].X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T0.W, KC0[2].X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 48abc49c41ae0a..9470d4aa46900d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1246,7 +1246,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 0xa50, 10
+; GFX678-NEXT: s_cselect_b32 s34, 1, 3
+; GFX678-NEXT: s_lshl_b32 s34, s34, 2
+; GFX678-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1254,7 +1256,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s34, 0xa50, 10
+; GFX9-NEXT: s_cselect_b32 s34, 1, 3
+; GFX9-NEXT: s_lshl_b32 s34, s34, 2
+; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1262,7 +1266,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, 0xa50, 10
+; GFX10-NEXT: s_cselect_b32 s34, 1, 3
+; GFX10-NEXT: s_lshl_b32 s34, s34, 2
+; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1270,7 +1276,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, 0xa50, 10
+; GFX11-NEXT: s_cselect_b32 s0, 1, 3
+; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1280,22 +1288,48 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
}
define void @v_set_rounding_select_1_3(i32 %cond) {
-; GFX678-LABEL: v_set_rounding_select_1_3:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xa50
-; GFX678-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX678-NEXT: v_cndmask_b32_e32 v0, 10, v1, vcc
-; GFX678-NEXT: v_readfirstlane_b32 s4, v0
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX678-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_set_rounding_select_1_3:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_set_rounding_select_1_3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_set_rounding_select_1_3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_mov_b32 s4, 0xa50f
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s4
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_set_rounding_select_1_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xa50
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 10, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s4, 0xa50f
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s4
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1304,7 +1338,9 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1313,7 +1349,9 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1330,7 +1368,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX6-NEXT: s_cmp_eq_u32 s4, 0
; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
; GFX6-NEXT: v_readfirstlane_b32 s34, v0
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
@@ -1342,7 +1381,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX7-NEXT: s_cmp_eq_u32 s4, 0
; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
; GFX7-NEXT: v_readfirstlane_b32 s34, v0
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
@@ -1354,7 +1394,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 s34, 0xa50f
; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34
; GFX8-NEXT: v_readfirstlane_b32 s34, v0
@@ -1367,7 +1408,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s34, 0xa50f
; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34
; GFX9-NEXT: v_readfirstlane_b32 s34, v0
@@ -1380,7 +1422,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s34, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
@@ -1392,7 +1435,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
@@ -1408,8 +1452,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_movk_i32 s34, 0xa5
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX678-NEXT: s_cselect_b32 s34, 2, 1
+; GFX678-NEXT: s_lshl_b32 s34, s34, 2
+; GFX678-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1417,8 +1462,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_movk_i32 s34, 0xa5
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX9-NEXT: s_cselect_b32 s34, 2, 1
+; GFX9-NEXT: s_lshl_b32 s34, s34, 2
+; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1426,8 +1472,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_movk_i32 s34, 0xa5
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX10-NEXT: s_cselect_b32 s34, 2, 1
+; GFX10-NEXT: s_lshl_b32 s34, s34, 2
+; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1435,8 +1482,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_movk_i32 s0, 0xa5
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50
+; GFX11-NEXT: s_cselect_b32 s0, 2, 1
+; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1450,8 +1498,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_movk_i32 s34, 0xa50
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX678-NEXT: s_cselect_b32 s34, 1, 2
+; GFX678-NEXT: s_lshl_b32 s34, s34, 2
+; GFX678-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1459,8 +1508,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_movk_i32 s34, 0xa50
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX9-NEXT: s_cselect_b32 s34, 1, 2
+; GFX9-NEXT: s_lshl_b32 s34, s34, 2
+; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1468,8 +1518,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_movk_i32 s34, 0xa50
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX10-NEXT: s_cselect_b32 s34, 1, 2
+; GFX10-NEXT: s_lshl_b32 s34, s34, 2
+; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1477,8 +1528,9 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_movk_i32 s0, 0xa50
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa5
+; GFX11-NEXT: s_cselect_b32 s0, 1, 2
+; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1492,7 +1544,9 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 10, 0xa50f
+; GFX678-NEXT: s_cselect_b32 s34, 3, 0
+; GFX678-NEXT: s_lshl_b32 s34, s34, 2
+; GFX678-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1500,7 +1554,9 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s34, 10, 0xa50f
+; GFX9-NEXT: s_cselect_b32 s34, 3, 0
+; GFX9-NEXT: s_lshl_b32 s34, s34, 2
+; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1508,7 +1564,9 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, 10, 0xa50f
+; GFX10-NEXT: s_cselect_b32 s34, 3, 0
+; GFX10-NEXT: s_lshl_b32 s34, s34, 2
+; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1516,7 +1574,9 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, 10, 0xa50f
+; GFX11-NEXT: s_cselect_b32 s0, 3, 0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 88b18232ef9c87..4a9a2f2d44cecc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -96,6 +96,10 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -109,6 +113,10 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -116,7 +124,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -125,13 +133,17 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.X, 1,
+; EG-NEXT: LSHL T0.W, PV.W, 1,
+; EG-NEXT: AND_INT * T1.W, T0.X, 1,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.X, T1.W, PV.W,
-; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
@@ -145,6 +157,11 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b16 v2, 1, v2
+; GFX12-NEXT: v_or_b32_e32 v1, v1, v2
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -169,6 +186,13 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -182,6 +206,14 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 2, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -189,7 +221,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -198,12 +230,20 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T0.X, 1, 1,
+; EG-NEXT: LSHR T0.Z, T0.X, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, 1,
+; EG-NEXT: AND_INT * T1.W, T0.X, 1,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 3(4.203895e-45), 2(2.802597e-45)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.X, T0.X, PV.W,
-; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
@@ -217,6 +257,17 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v3, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: v_lshlrev_b16 v1, 2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b16 v2, 1, v2
+; GFX12-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_or_b32_e32 v1, v2, v1
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -241,6 +292,16 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX6-NEXT: v_bfe_u32 v3, v0, 1, 1
+; GFX6-NEXT: v_bfe_u32 v0, v0, 2, 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -254,6 +315,18 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v2
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, 2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 3, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -261,7 +334,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -270,13 +343,26 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
-; EG-NEXT: 3(4.203895e-45), 15(2.101948e-44)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T0.X, 1, 1,
+; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.x, 1,
+; EG-NEXT: LSHL T0.W, PV.W, 1,
+; EG-NEXT: AND_INT * T1.W, T0.X, 1,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.Z, T0.X, literal.x,
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 3(4.203895e-45), 2(2.802597e-45)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.X, T1.W, PV.W,
-; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 15(2.101948e-44), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
@@ -290,6 +376,22 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v3, 2, v1
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_lshlrev_b16 v1, 3, v1
+; GFX12-NEXT: v_lshlrev_b16 v2, 1, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshlrev_b16 v3, 2, v3
+; GFX12-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX12-NEXT: v_or_b32_e32 v1, v2, v1
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1358,20 +1460,21 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s8, s2
; GFX6-NEXT: s_mov_b32 s9, s3
-; GFX6-NEXT: buffer_load_ubyte v4, off, s[8:11], 0
+; GFX6-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v3, v4, 3, 1
-; GFX6-NEXT: v_bfe_u32 v1, v4, 1, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 7, v4
-; GFX6-NEXT: v_bfe_u32 v5, v4, 5, 1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v4
-; GFX6-NEXT: v_bfe_u32 v2, v4, 2, 1
-; GFX6-NEXT: v_bfe_u32 v6, v4, 6, 1
-; GFX6-NEXT: v_bfe_u32 v4, v4, 4, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, 4, v1
+; GFX6-NEXT: v_bfe_u32 v7, v1, 3, 1
+; GFX6-NEXT: v_bfe_u32 v5, v1, 1, 1
+; GFX6-NEXT: v_bfe_u32 v2, v1, 6, 1
+; GFX6-NEXT: v_bfe_u32 v0, v1, 4, 1
+; GFX6-NEXT: v_and_b32_e32 v4, 1, v1
+; GFX6-NEXT: v_bfe_u32 v6, v1, 2, 1
+; GFX6-NEXT: v_bfe_u32 v1, v8, 1, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i32:
@@ -1388,23 +1491,20 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v10, s2
; GFX8-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 5, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v12, 3, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 6, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 2, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 6, v1
+; GFX8-NEXT: v_and_b32_e32 v6, 2, v1
+; GFX8-NEXT: v_and_b32_e32 v12, 2, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 8, v1
; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v1
-; GFX8-NEXT: v_and_b32_e32 v13, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 1, v12
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v6
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v13
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -1413,62 +1513,63 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
+; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_UINT * T6.W, T5.X, literal.x, 1,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T6.Y, T5.X, 1, 1,
-; EG-NEXT: BFE_UINT * T5.W, T5.X, literal.x, 1,
-; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T6.X, T5.X, 1,
-; EG-NEXT: BFE_UINT T5.Z, T5.X, literal.x, 1,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.y,
-; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT: BFE_UINT * T5.Y, T5.X, literal.x, 1,
-; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T5.X, T5.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T6.Z, T5.X, literal.x, 1,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 4(5.605194e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHR T7.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T6.Y, T5.X, 1, 1,
+; EG-NEXT: BFE_UINT T8.Z, T5.X, literal.y, 1,
+; EG-NEXT: AND_INT * T6.X, T5.X, 1,
+; EG-NEXT: 2(2.802597e-45), 6(8.407791e-45)
+; EG-NEXT: BFE_UINT T8.Y, T5.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
+; EG-NEXT: 5(7.006492e-45), 4(5.605194e-45)
+; EG-NEXT: BFE_UINT T8.X, T5.X, literal.x, 1,
+; EG-NEXT: AND_INT T1.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T8.W, PS, literal.y,
+; EG-NEXT: LSHR * T6.W, PV.W, literal.y,
+; EG-NEXT: 2(2.802597e-45), 3(4.203895e-45)
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v8, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0
-; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v6, 3, v0
-; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0
-; GFX12-NEXT: v_lshrrev_b16 v3, 6, v0
-; GFX12-NEXT: v_and_b32_e32 v9, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v4, 7, v0
-; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0
-; GFX12-NEXT: v_and_b32_e32 v10, 1, v5
-; GFX12-NEXT: v_and_b32_e32 v5, 1, v6
-; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v7
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 2, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 8, v1
+; GFX12-NEXT: v_lshrrev_b16 v5, 2, v1
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, v1
+; GFX12-NEXT: v_and_b32_e32 v7, 2, v4
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v10, 3, v2
+; GFX12-NEXT: v_lshrrev_b16 v3, 7, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v6
+; GFX12-NEXT: v_lshrrev_b16 v11, 1, v7
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v4
; GFX12-NEXT: v_and_b32_e32 v4, 1, v1
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v10
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v5
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1621,30 +1722,34 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s8, s6
; GFX6-NEXT: s_mov_b32 s9, s7
-; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0
+; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v3, v12, 3, 1
-; GFX6-NEXT: v_bfe_u32 v1, v12, 1, 1
-; GFX6-NEXT: v_bfe_u32 v7, v12, 7, 1
-; GFX6-NEXT: v_bfe_u32 v5, v12, 5, 1
-; GFX6-NEXT: v_bfe_u32 v11, v12, 11, 1
-; GFX6-NEXT: v_bfe_u32 v9, v12, 9, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 15, v12
-; GFX6-NEXT: v_bfe_u32 v13, v12, 13, 1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v12
-; GFX6-NEXT: v_bfe_u32 v2, v12, 2, 1
-; GFX6-NEXT: v_bfe_u32 v6, v12, 6, 1
-; GFX6-NEXT: v_bfe_u32 v4, v12, 4, 1
-; GFX6-NEXT: v_bfe_u32 v10, v12, 10, 1
-; GFX6-NEXT: v_bfe_u32 v8, v12, 8, 1
-; GFX6-NEXT: v_bfe_u32 v14, v12, 14, 1
-; GFX6-NEXT: v_bfe_u32 v12, v12, 12, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: v_bfe_u32 v3, v0, 7, 1
+; GFX6-NEXT: v_bfe_u32 v1, v0, 4, 4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 15, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v16, 12, v0
+; GFX6-NEXT: v_bfe_u32 v11, v0, 3, 1
+; GFX6-NEXT: v_bfe_u32 v9, v0, 1, 1
+; GFX6-NEXT: v_and_b32_e32 v17, 0xf0, v0
+; GFX6-NEXT: v_bfe_u32 v2, v0, 6, 1
+; GFX6-NEXT: v_bfe_u32 v14, v0, 10, 1
+; GFX6-NEXT: v_bfe_u32 v12, v0, 8, 1
+; GFX6-NEXT: v_bfe_u32 v6, v0, 14, 1
+; GFX6-NEXT: v_bfe_u32 v4, v0, 12, 1
+; GFX6-NEXT: v_and_b32_e32 v8, 1, v0
+; GFX6-NEXT: v_bfe_u32 v10, v0, 2, 1
+; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 1
+; GFX6-NEXT: v_bfe_u32 v15, v5, 3, 1
+; GFX6-NEXT: v_bfe_u32 v13, v5, 1, 1
+; GFX6-NEXT: v_bfe_u32 v5, v16, 1, 1
+; GFX6-NEXT: v_bfe_u32 v0, v17, 4, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i32:
@@ -1659,162 +1764,159 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v0, 1
; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v23, s1
; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v8, 3, v1
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v8
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, 14, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v11, 2, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, 13, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v9, 9, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 10, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v1
-; GFX8-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v7
-; GFX8-NEXT: v_and_b32_e32 v14, 1, v11
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v12
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 11, v1
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v11, 15, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 5, v1
-; GFX8-NEXT: v_and_b32_e32 v24, 1, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v9
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v24
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v1
+; GFX8-NEXT: v_and_b32_e32 v9, 2, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 8, v1
+; GFX8-NEXT: v_and_b32_e32 v11, 0xf0, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 12, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 14, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 10, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v12, 6, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v9
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 4, v11
+; GFX8-NEXT: v_and_b32_e32 v13, 2, v4
+; GFX8-NEXT: v_and_b32_e32 v24, 2, v9
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT: v_and_b32_e32 v22, 2, v6
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_and_b32_e32 v23, 8, v6
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v8
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v15, 7, v11
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v9
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v13
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v11, 3, v23
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v22
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i1_to_v16i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 36, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 1
+; EG-NEXT: ALU 40, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T7.X, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: BFE_UINT * T8.W, T7.X, literal.x, 1,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T8.Z, T7.X, literal.x, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.Y, T7.X, 1, 1,
-; EG-NEXT: BFE_UINT * T9.W, T7.X, literal.x, 1,
-; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.X, 1,
-; EG-NEXT: BFE_UINT T9.Z, T7.X, literal.x, 1,
-; EG-NEXT: LSHR * T10.X, KC0[2].Y, literal.y,
-; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT: BFE_UINT T9.Y, T7.X, literal.x, 1,
-; EG-NEXT: BFE_UINT * T11.W, T7.X, literal.y, 1,
-; EG-NEXT: 5(7.006492e-45), 11(1.541428e-44)
-; EG-NEXT: BFE_UINT T9.X, T7.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T11.Z, T7.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T11.Y, T7.X, literal.y, 1,
-; EG-NEXT: LSHR * T7.W, T7.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT: LSHR T8.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR * T9.W, T7.X, literal.z,
+; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T11.X, T7.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T7.Z, T7.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T13.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT * T7.Y, T7.X, literal.y, 1,
-; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT: BFE_UINT T7.X, T7.X, literal.x, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 12(1.681558e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T10.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T9.Z, T7.X, literal.y, 1,
+; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 14(1.961818e-44)
+; EG-NEXT: BFE_UINT T9.Y, T7.X, literal.x, 1,
+; EG-NEXT: BFE_UINT * T12.Z, T7.X, literal.y, 1,
+; EG-NEXT: 13(1.821688e-44), 2(2.802597e-45)
+; EG-NEXT: BFE_UINT T9.X, T7.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T12.Y, T7.X, 1, 1,
+; EG-NEXT: BFE_UINT T13.Z, T7.X, literal.y, 1,
+; EG-NEXT: AND_INT * T12.X, T7.X, 1,
+; EG-NEXT: 12(1.681558e-44), 6(8.407791e-45)
+; EG-NEXT: BFE_UINT T13.Y, T7.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T14.Z, T7.X, literal.y, 1,
+; EG-NEXT: LSHR * T0.W, T7.X, literal.z,
+; EG-NEXT: 5(7.006492e-45), 10(1.401298e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T13.X, T7.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T14.Y, T7.X, literal.y, 1,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.z,
+; EG-NEXT: LSHR * T1.W, T7.X, literal.z,
+; EG-NEXT: 4(5.605194e-45), 9(1.261169e-44)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T14.X, T7.X, literal.x, 1,
+; EG-NEXT: AND_INT T0.Y, T7.X, literal.x,
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: ADD_INT T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR * T13.W, PV.W, literal.z,
+; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T7.X, PV.W, literal.x,
+; EG-NEXT: LSHR T14.W, PV.Z, literal.y,
+; EG-NEXT: LSHR * T12.W, PV.Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 3(4.203895e-45)
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v16, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0
-; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v15, 3, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 9, v0
-; GFX12-NEXT: v_lshrrev_b16 v6, 11, v0
-; GFX12-NEXT: v_and_b32_e32 v17, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v10, 5, v0
-; GFX12-NEXT: v_lshrrev_b16 v12, 7, v0
-; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0
-; GFX12-NEXT: v_lshrrev_b16 v3, 14, v0
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, v0
-; GFX12-NEXT: v_lshrrev_b16 v14, 2, v0
-; GFX12-NEXT: v_and_b32_e32 v22, 1, v13
-; GFX12-NEXT: v_and_b32_e32 v13, 1, v15
-; GFX12-NEXT: v_lshrrev_b16 v7, 8, v0
-; GFX12-NEXT: v_lshrrev_b16 v8, 10, v0
-; GFX12-NEXT: v_lshrrev_b16 v9, 4, v0
-; GFX12-NEXT: v_lshrrev_b16 v11, 6, v0
-; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
-; GFX12-NEXT: v_and_b32_e32 v19, 1, v6
-; GFX12-NEXT: v_and_b32_e32 v20, 1, v10
-; GFX12-NEXT: v_and_b32_e32 v21, 1, v12
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v14
-; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX12-NEXT: v_and_b32_e32 v14, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v8, 0xf0, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 2, v1
+; GFX12-NEXT: v_lshrrev_b16 v3, 12, v1
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, v1
+; GFX12-NEXT: v_lshrrev_b16 v5, 14, v1
+; GFX12-NEXT: v_lshrrev_b16 v18, 4, v8
+; GFX12-NEXT: v_and_b32_e32 v2, 8, v1
+; GFX12-NEXT: v_lshrrev_b16 v9, 6, v1
+; GFX12-NEXT: v_and_b32_e32 v15, 8, v4
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 2, v3
+; GFX12-NEXT: v_lshrrev_b16 v7, 15, v1
+; GFX12-NEXT: v_and_b32_e32 v12, 2, v4
+; GFX12-NEXT: v_and_b32_e32 v10, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v5, 2, v18
+; GFX12-NEXT: v_lshrrev_b16 v14, 3, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v9
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 10, v1
+; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v7
+; GFX12-NEXT: v_lshrrev_b16 v7, 3, v15
+; GFX12-NEXT: v_lshrrev_b16 v12, 1, v12
+; GFX12-NEXT: v_lshrrev_b16 v13, 2, v1
+; GFX12-NEXT: v_lshrrev_b16 v19, 7, v8
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v18
+; GFX12-NEXT: v_lshrrev_b16 v18, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v12, 1, v1
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX12-NEXT: v_and_b32_e32 v14, 1, v13
; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v17
-; GFX12-NEXT: v_and_b32_e32 v6, 1, v11
-; GFX12-NEXT: v_and_b32_e32 v4, 1, v9
-; GFX12-NEXT: v_and_b32_e32 v10, 1, v8
-; GFX12-NEXT: v_and_b32_e32 v8, 1, v7
-; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v19
-; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v18
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v21
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v20
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2049,303 +2151,313 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_bfe_u32 s5, s4, 0x10003
-; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10001
-; GFX6-NEXT: s_bfe_u32 s7, s4, 0x10007
-; GFX6-NEXT: s_bfe_u32 s8, s4, 0x10005
-; GFX6-NEXT: s_bfe_u32 s9, s4, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s10, s4, 0x10009
-; GFX6-NEXT: s_bfe_u32 s11, s4, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s12, s4, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s13, s4, 0x10013
-; GFX6-NEXT: s_bfe_u32 s14, s4, 0x10011
-; GFX6-NEXT: s_bfe_u32 s15, s4, 0x10017
-; GFX6-NEXT: s_bfe_u32 s16, s4, 0x10015
-; GFX6-NEXT: s_bfe_u32 s17, s4, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s18, s4, 0x10019
-; GFX6-NEXT: s_lshr_b32 s19, s4, 31
-; GFX6-NEXT: s_bfe_u32 s20, s4, 0x1001d
-; GFX6-NEXT: s_and_b32 s21, s4, 1
-; GFX6-NEXT: s_bfe_u32 s22, s4, 0x10002
-; GFX6-NEXT: s_bfe_u32 s23, s4, 0x10006
-; GFX6-NEXT: s_bfe_u32 s24, s4, 0x10004
-; GFX6-NEXT: s_bfe_u32 s25, s4, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s26, s4, 0x10008
-; GFX6-NEXT: s_bfe_u32 s27, s4, 0x1000e
-; GFX6-NEXT: s_bfe_u32 s28, s4, 0x1000c
-; GFX6-NEXT: s_bfe_u32 s29, s4, 0x10012
-; GFX6-NEXT: s_bfe_u32 s30, s4, 0x10010
-; GFX6-NEXT: s_bfe_u32 s31, s4, 0x10016
-; GFX6-NEXT: s_bfe_u32 s33, s4, 0x10014
-; GFX6-NEXT: s_bfe_u32 s34, s4, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s35, s4, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s36, s4, 0x1001c
-; GFX6-NEXT: s_bfe_u32 s4, s4, 0x10018
-; GFX6-NEXT: v_mov_b32_e32 v0, s36
-; GFX6-NEXT: v_mov_b32_e32 v1, s20
-; GFX6-NEXT: v_mov_b32_e32 v2, s35
-; GFX6-NEXT: v_mov_b32_e32 v3, s19
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s18
-; GFX6-NEXT: v_mov_b32_e32 v2, s34
-; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s33
-; GFX6-NEXT: v_mov_b32_e32 v1, s16
-; GFX6-NEXT: v_mov_b32_e32 v2, s31
-; GFX6-NEXT: v_mov_b32_e32 v3, s15
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GFX6-NEXT: s_lshr_b32 s5, s4, 16
+; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10007
+; GFX6-NEXT: s_bfe_u32 s7, s4, 0x40004
+; GFX6-NEXT: s_bfe_u32 s8, s4, 0x80008
+; GFX6-NEXT: s_bfe_u32 s9, s4, 0x1000f
+; GFX6-NEXT: s_bfe_u32 s10, s4, 0x4000c
+; GFX6-NEXT: s_lshr_b32 s11, s4, 24
+; GFX6-NEXT: s_lshr_b32 s12, s4, 31
+; GFX6-NEXT: s_lshr_b32 s13, s4, 28
+; GFX6-NEXT: s_bfe_u32 s14, s4, 0x10003
+; GFX6-NEXT: s_bfe_u32 s15, s4, 0x10001
+; GFX6-NEXT: s_and_b32 s16, s4, 0xff00
+; GFX6-NEXT: s_and_b32 s17, s4, 0xf0
+; GFX6-NEXT: s_bfe_u32 s18, s4, 0x10016
+; GFX6-NEXT: s_bfe_u32 s19, s4, 0x10006
+; GFX6-NEXT: s_bfe_u32 s20, s4, 0x1000a
+; GFX6-NEXT: s_bfe_u32 s21, s4, 0x1000e
+; GFX6-NEXT: s_bfe_u32 s22, s4, 0x10012
+; GFX6-NEXT: s_bfe_u32 s23, s4, 0x10010
+; GFX6-NEXT: s_bfe_u32 s24, s4, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s25, s4, 0x10018
+; GFX6-NEXT: s_bfe_u32 s26, s4, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s27, s4, 0x1001c
+; GFX6-NEXT: s_and_b32 s28, s4, 1
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x10002
+; GFX6-NEXT: s_bfe_u32 s29, s5, 0x10007
+; GFX6-NEXT: s_bfe_u32 s30, s5, 0x40004
+; GFX6-NEXT: s_bfe_u32 s7, s7, 0x10001
+; GFX6-NEXT: s_bfe_u32 s31, s8, 0x10003
+; GFX6-NEXT: s_bfe_u32 s8, s8, 0x10001
+; GFX6-NEXT: s_bfe_u32 s10, s10, 0x10001
+; GFX6-NEXT: s_bfe_u32 s33, s5, 0x10003
+; GFX6-NEXT: s_bfe_u32 s34, s5, 0x10001
+; GFX6-NEXT: s_bfe_u32 s35, s11, 0x10003
+; GFX6-NEXT: s_bfe_u32 s11, s11, 0x10001
+; GFX6-NEXT: s_bfe_u32 s13, s13, 0x10001
+; GFX6-NEXT: s_and_b32 s5, s5, 0xf0
+; GFX6-NEXT: s_bfe_u32 s17, s17, 0x10004
+; GFX6-NEXT: s_bfe_u32 s36, s16, 0x10008
+; GFX6-NEXT: s_bfe_u32 s16, s16, 0x1000c
+; GFX6-NEXT: v_mov_b32_e32 v0, s28
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, s14
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s30
-; GFX6-NEXT: v_mov_b32_e32 v1, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s29
-; GFX6-NEXT: v_mov_b32_e32 v3, s13
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v0, s27
+; GFX6-NEXT: v_mov_b32_e32 v2, s26
+; GFX6-NEXT: v_mov_b32_e32 v3, s12
+; GFX6-NEXT: v_mov_b32_e32 v4, s25
+; GFX6-NEXT: v_mov_b32_e32 v6, s24
+; GFX6-NEXT: v_mov_b32_e32 v8, s23
+; GFX6-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NEXT: v_mov_b32_e32 v12, s21
+; GFX6-NEXT: v_mov_b32_e32 v13, s9
+; GFX6-NEXT: v_mov_b32_e32 v14, s20
+; GFX6-NEXT: v_mov_b32_e32 v16, s19
+; GFX6-NEXT: v_mov_b32_e32 v17, s6
+; GFX6-NEXT: v_mov_b32_e32 v18, s18
+; GFX6-NEXT: s_bfe_u32 s4, s30, 0x10001
+; GFX6-NEXT: s_bfe_u32 s5, s5, 0x10004
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: v_mov_b32_e32 v5, s11
+; GFX6-NEXT: v_mov_b32_e32 v7, s35
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GFX6-NEXT: v_mov_b32_e32 v9, s34
+; GFX6-NEXT: v_mov_b32_e32 v11, s33
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s28
-; GFX6-NEXT: v_mov_b32_e32 v1, s12
-; GFX6-NEXT: v_mov_b32_e32 v2, s27
-; GFX6-NEXT: v_mov_b32_e32 v3, s11
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NEXT: v_mov_b32_e32 v10, s16
+; GFX6-NEXT: v_mov_b32_e32 v11, s10
+; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s26
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s25
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NEXT: v_mov_b32_e32 v12, s36
+; GFX6-NEXT: v_mov_b32_e32 v13, s8
+; GFX6-NEXT: v_mov_b32_e32 v15, s31
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s24
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: v_mov_b32_e32 v2, s23
-; GFX6-NEXT: v_mov_b32_e32 v3, s7
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NEXT: v_mov_b32_e32 v14, s17
+; GFX6-NEXT: v_mov_b32_e32 v15, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:16
+; GFX6-NEXT: v_mov_b32_e32 v19, s29
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s21
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s22
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v16, s5
+; GFX6-NEXT: v_mov_b32_e32 v17, s4
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v2, 0xf0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s4
-; GFX8-NEXT: v_and_b32_e32 v24, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s4
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s4
-; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s4
-; GFX8-NEXT: v_and_b32_e32 v23, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s4
-; GFX8-NEXT: v_and_b32_e32 v26, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT: v_and_b32_e32 v17, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
-; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s2
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x10018
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s2
-; GFX8-NEXT: s_and_b32 s6, s4, 1
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x10013
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x10012
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x10011
-; GFX8-NEXT: s_bfe_u32 s10, s4, 0x10010
-; GFX8-NEXT: s_bfe_u32 s2, s4, 0x10017
-; GFX8-NEXT: s_bfe_u32 s3, s4, 0x10016
-; GFX8-NEXT: s_bfe_u32 s11, s4, 0x10015
-; GFX8-NEXT: s_bfe_u32 s12, s4, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v11, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_lshr_b32 s2, s4, 16
+; GFX8-NEXT: v_and_b32_e64 v0, s2, 2
+; GFX8-NEXT: v_and_b32_e64 v1, s2, 8
+; GFX8-NEXT: s_lshr_b32 s5, s4, 24
+; GFX8-NEXT: v_and_b32_e32 v7, s2, v2
+; GFX8-NEXT: s_bfe_u32 s6, s4, 0x10016
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x10018
+; GFX8-NEXT: s_and_b32 s8, s4, 1
+; GFX8-NEXT: s_bfe_u32 s2, s4, 0x10012
+; GFX8-NEXT: s_bfe_u32 s3, s4, 0x10010
+; GFX8-NEXT: v_and_b32_e32 v6, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: v_mov_b32_e32 v9, s11
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v17, 4, v7
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s4
-; GFX8-NEXT: v_mov_b32_e32 v8, s10
-; GFX8-NEXT: v_mov_b32_e32 v9, s9
-; GFX8-NEXT: v_mov_b32_e32 v10, s8
-; GFX8-NEXT: v_mov_b32_e32 v11, s7
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s4
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v22
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v25
+; GFX8-NEXT: v_and_b32_e32 v0, 2, v17
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s4
+; GFX8-NEXT: v_lshrrev_b16_e32 v14, 4, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v17
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v16
+; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 14, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 8, s4
+; GFX8-NEXT: v_and_b32_e32 v13, 2, v8
+; GFX8-NEXT: v_and_b32_e32 v15, 2, v14
+; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s4
+; GFX8-NEXT: v_and_b32_e32 v24, 2, v20
+; GFX8-NEXT: v_and_b32_e32 v22, 8, v20
+; GFX8-NEXT: v_and_b32_e64 v23, s4, 2
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 4, s5
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v15
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 15, s4
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 1, v13
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v8
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_and_b32_e32 v26, 2, v25
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GFX8-NEXT: v_lshrrev_b16_e32 v17, 1, v23
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v25
+; GFX8-NEXT: v_lshrrev_b16_e32 v23, 3, v22
; GFX8-NEXT: v_and_b32_e32 v22, 1, v21
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v24
+; GFX8-NEXT: v_lshrrev_b16_e32 v21, 1, v24
; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s4
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v23
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s4
; GFX8-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v24, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s4
+; GFX8-NEXT: v_and_b32_e64 v19, s4, 8
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s4
-; GFX8-NEXT: v_mov_b32_e32 v8, 1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v19
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX8-NEXT: v_and_b32_sdwa v16, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v19, 3, v19
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX8-NEXT: v_mov_b32_e32 v16, s8
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s4
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 6, s5
; GFX8-NEXT: v_mov_b32_e32 v17, s3
-; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s5
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v27
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 1, v26
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v26
-; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s5
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_and_b32_e64 v9, s5, 2
+; GFX8-NEXT: v_and_b32_e64 v11, s5, 8
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_lshrrev_b16_e32 v11, 3, v11
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT: v_mov_b32_e32 v8, s7
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: s_add_u32 s0, s0, 0x50
; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
; GFX8-NEXT: v_mov_b32_e32 v8, s2
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v32i1_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 76, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 1
+; EG-NEXT: ALU 85, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T12.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_32 T11.X, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: BFE_UINT * T12.W, T11.X, literal.x, 1,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T12.Z, T11.X, literal.x, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T12.Y, T11.X, 1, 1,
-; EG-NEXT: BFE_UINT * T13.W, T11.X, literal.x, 1,
-; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T12.X, T11.X, 1,
-; EG-NEXT: BFE_UINT T13.Z, T11.X, literal.x, 1,
-; EG-NEXT: LSHR * T14.X, KC0[2].Y, literal.y,
-; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, 1,
-; EG-NEXT: BFE_UINT * T15.W, T11.X, literal.y, 1,
-; EG-NEXT: 5(7.006492e-45), 11(1.541428e-44)
-; EG-NEXT: BFE_UINT T13.X, T11.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T15.Z, T11.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44)
+; EG-NEXT: MOV T11.X, KC0[2].Z,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 17:
+; EG-NEXT: LSHR T12.X, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T15.Y, T11.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T17.W, T11.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44)
-; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T15.X, T11.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T17.Z, T11.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR * T17.W, T11.X, literal.z,
+; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T17.Y, T11.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T19.W, T11.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T17.Z, T11.X, literal.y, 1,
+; EG-NEXT: LSHR * T19.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 30(4.203895e-44)
+; EG-NEXT: BFE_UINT T17.Y, T11.X, literal.x, 1,
+; EG-NEXT: BFE_UINT * T20.Z, T11.X, literal.y, 1,
+; EG-NEXT: 29(4.063766e-44), 2(2.802597e-45)
; EG-NEXT: BFE_UINT T17.X, T11.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T19.Z, T11.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44)
-; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T19.Y, T11.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T21.W, T11.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44)
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.X, T11.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T20.Y, T11.X, 1, 1,
; EG-NEXT: BFE_UINT T21.Z, T11.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44)
-; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T21.Y, T11.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T23.W, T11.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, T11.X, literal.z,
+; EG-NEXT: LSHR * T1.W, T11.X, literal.w,
+; EG-NEXT: 28(3.923636e-44), 22(3.082857e-44)
+; EG-NEXT: 8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT: AND_INT T20.X, T11.X, 1,
+; EG-NEXT: BFE_UINT T21.Y, T11.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T22.Z, T11.X, literal.y, 1,
+; EG-NEXT: AND_INT T1.W, PS, literal.z,
+; EG-NEXT: LSHR * T2.W, T11.X, literal.w,
+; EG-NEXT: 21(2.942727e-44), 14(1.961818e-44)
+; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: BFE_UINT T21.X, T11.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T23.Z, T11.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44)
-; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T24.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T22.Y, T11.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T23.Z, T11.X, literal.z, 1,
+; EG-NEXT: AND_INT * T2.W, PS, literal.w,
+; EG-NEXT: 20(2.802597e-44), 13(1.821688e-44)
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T3.W, T11.X, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T22.X, T11.X, literal.x, 1,
; EG-NEXT: BFE_UINT T23.Y, T11.X, literal.y, 1,
-; EG-NEXT: LSHR * T11.W, T11.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T24.Z, T11.X, literal.z, 1,
+; EG-NEXT: AND_INT * T3.W, PV.W, literal.w,
+; EG-NEXT: 12(1.681558e-44), 5(7.006492e-45)
+; EG-NEXT: 10(1.401298e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T4.W, T11.X, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T23.X, T11.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T11.Z, T11.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44)
-; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T25.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT * T11.Y, T11.X, literal.y, 1,
-; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44)
+; EG-NEXT: BFE_UINT T24.Y, T11.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T25.Z, T11.X, literal.z, 1,
+; EG-NEXT: AND_INT T4.W, PV.W, literal.w,
+; EG-NEXT: LSHR * T5.W, T11.X, literal.w,
+; EG-NEXT: 4(5.605194e-45), 9(1.261169e-44)
+; EG-NEXT: 18(2.522337e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T24.X, T11.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T25.Y, T11.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T11.Z, T11.X, literal.z, 1,
+; EG-NEXT: AND_INT T5.W, PS, literal.x,
+; EG-NEXT: LSHR * T23.W, PV.W, literal.w,
+; EG-NEXT: 8(1.121039e-44), 17(2.382207e-44)
+; EG-NEXT: 26(3.643376e-44), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T25.X, T11.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T11.Y, T11.X, literal.y, 1,
+; EG-NEXT: LSHR T0.Z, T11.X, literal.z,
+; EG-NEXT: LSHR T24.W, PV.W, literal.w,
+; EG-NEXT: LSHR * T22.W, T3.W, literal.w,
+; EG-NEXT: 16(2.242078e-44), 25(3.503246e-44)
+; EG-NEXT: 24(3.363116e-44), 3(4.203895e-45)
; EG-NEXT: BFE_UINT T11.X, T11.X, literal.x, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 28(3.923636e-44), 112(1.569454e-43)
-; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Y, PV.Z, literal.y,
+; EG-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T25.W, T2.W, literal.w,
+; EG-NEXT: LSHR * T21.W, T1.W, literal.w,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: 112(1.569454e-43), 3(4.203895e-45)
+; EG-NEXT: LSHR T26.X, PV.Z, literal.x,
+; EG-NEXT: LSHR T11.W, PV.Y, literal.y,
+; EG-NEXT: LSHR * T20.W, T0.W, literal.y,
+; EG-NEXT: 2(2.802597e-45), 3(4.203895e-45)
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
@@ -2354,81 +2466,92 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2
-; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2
+; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: v_and_b32_e64 v2, 0xf0, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
-; GFX12-NEXT: v_and_b32_e32 v25, 1, v14
-; GFX12-NEXT: v_and_b32_e32 v26, 1, v18
-; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
-; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
-; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
-; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014
-; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
-; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
-; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
-; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
-; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
-; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
+; GFX12-NEXT: v_and_b32_e64 v9, s3, 2
+; GFX12-NEXT: v_and_b32_e64 v19, 0xf0, s4
+; GFX12-NEXT: v_and_b32_e64 v0, s2, 2
+; GFX12-NEXT: v_and_b32_e64 v1, s2, 8
+; GFX12-NEXT: v_lshrrev_b16 v8, 12, s2
+; GFX12-NEXT: v_and_b32_e64 v14, s4, 8
+; GFX12-NEXT: v_lshrrev_b16 v17, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v23, 4, v2
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2
+; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v20, 2, s3
+; GFX12-NEXT: v_and_b32_e64 v12, s3, 8
+; GFX12-NEXT: v_and_b32_e64 v13, s4, 2
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v15, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v25, 7, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v7
+; GFX12-NEXT: v_lshrrev_b16 v28, 1, v9
+; GFX12-NEXT: v_lshrrev_b16 v7, 4, v19
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10018
+; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v21, 8, v4
+; GFX12-NEXT: v_lshrrev_b16 v22, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v24, 3, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 2, v4
+; GFX12-NEXT: v_and_b32_e32 v0, 2, v8
+; GFX12-NEXT: v_lshrrev_b16 v29, 3, v14
+; GFX12-NEXT: v_lshrrev_b16 v9, 7, v19
+; GFX12-NEXT: v_and_b32_e32 v14, 1, v17
+; GFX12-NEXT: v_and_b32_e32 v17, 2, v23
+; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v5
; GFX12-NEXT: v_and_b32_e32 v10, 1, v3
-; GFX12-NEXT: v_and_b32_e32 v14, 1, v19
-; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX12-NEXT: v_and_b32_e32 v18, 1, v16
-; GFX12-NEXT: v_and_b32_e32 v16, 1, v15
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v24
-; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_and_b32 v13, 0xffff, v26
-; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v23
-; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9
-; GFX12-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25
-; GFX12-NEXT: v_mov_b32_e32 v25, s2
-; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22
-; GFX12-NEXT: v_and_b32_e32 v22, 1, v12
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21
-; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33
+; GFX12-NEXT: v_lshrrev_b16 v26, 3, v12
+; GFX12-NEXT: v_lshrrev_b16 v27, 1, v13
+; GFX12-NEXT: v_and_b32_e32 v5, 2, v16
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v25
+; GFX12-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_and_b32 v25, 0xffff, v28
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, v21
+; GFX12-NEXT: v_and_b32_e32 v21, 2, v7
+; GFX12-NEXT: v_lshrrev_b16 v30, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, v17
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v23
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, v21
+; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v27
+; GFX12-NEXT: v_and_b32_e32 v27, 0xffff, v26
+; GFX12-NEXT: v_and_b32_e32 v26, 1, v20
+; GFX12-NEXT: v_mov_b32_e32 v20, s4
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v30
+; GFX12-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_and_b32 v31, 0xffff, v24
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 1, v7
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v23
+; GFX12-NEXT: v_and_b32_e32 v23, 0xffff, v29
+; GFX12-NEXT: v_and_b32_e32 v30, 1, v18
+; GFX12-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_and_b32 v29, 0xffff, v22
+; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v18, s5
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:80
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2863,584 +2986,609 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003
-; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10001
-; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10007
-; GFX6-NEXT: s_bfe_u32 s7, s2, 0x10005
-; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10009
-; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s11, s2, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10013
-; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10011
-; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10017
-; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10015
-; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10019
-; GFX6-NEXT: s_lshr_b32 s18, s2, 31
-; GFX6-NEXT: s_bfe_u32 s19, s2, 0x1001d
-; GFX6-NEXT: s_bfe_u32 s20, s3, 0x10003
-; GFX6-NEXT: s_bfe_u32 s21, s3, 0x10001
+; GFX6-NEXT: s_lshr_b32 s24, s2, 16
+; GFX6-NEXT: s_lshr_b32 s25, s3, 16
+; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10007
+; GFX6-NEXT: s_bfe_u32 s13, s2, 0x40004
+; GFX6-NEXT: s_bfe_u32 s17, s2, 0x80008
+; GFX6-NEXT: s_bfe_u32 s7, s2, 0x1000f
+; GFX6-NEXT: s_bfe_u32 s19, s2, 0x4000c
+; GFX6-NEXT: s_lshr_b32 s26, s2, 24
+; GFX6-NEXT: s_lshr_b32 s16, s2, 31
+; GFX6-NEXT: s_lshr_b32 s27, s2, 28
; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10007
-; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10005
-; GFX6-NEXT: s_bfe_u32 s24, s3, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10009
-; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s27, s3, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s28, s3, 0x10013
-; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10011
-; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10017
-; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10015
-; GFX6-NEXT: s_bfe_u32 s33, s3, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10019
-; GFX6-NEXT: s_lshr_b32 s35, s3, 31
-; GFX6-NEXT: s_bfe_u32 s36, s3, 0x1001d
-; GFX6-NEXT: s_and_b32 s37, s2, 1
-; GFX6-NEXT: s_bfe_u32 s38, s2, 0x10002
-; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10006
-; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004
-; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s42, s2, 0x10008
-; GFX6-NEXT: s_bfe_u32 s43, s2, 0x1000e
-; GFX6-NEXT: s_bfe_u32 s44, s2, 0x1000c
-; GFX6-NEXT: s_bfe_u32 s45, s2, 0x10012
-; GFX6-NEXT: s_bfe_u32 s46, s2, 0x10010
-; GFX6-NEXT: s_bfe_u32 s47, s2, 0x10016
-; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10014
-; GFX6-NEXT: s_bfe_u32 s49, s2, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018
-; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c
-; GFX6-NEXT: s_and_b32 s53, s3, 1
-; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002
-; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10006
-; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10004
-; GFX6-NEXT: s_bfe_u32 s57, s3, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s58, s3, 0x10008
-; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000e
-; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012
-; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010
-; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016
-; GFX6-NEXT: s_bfe_u32 s63, s3, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10018
-; GFX6-NEXT: s_bfe_u32 s65, s3, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001c
-; GFX6-NEXT: s_bfe_u32 s67, s3, 0x10014
-; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000c
+; GFX6-NEXT: s_bfe_u32 s28, s3, 0x40004
+; GFX6-NEXT: s_bfe_u32 s29, s3, 0x80008
+; GFX6-NEXT: s_bfe_u32 s30, s3, 0x1000f
+; GFX6-NEXT: s_bfe_u32 s31, s3, 0x4000c
+; GFX6-NEXT: s_lshr_b32 s33, s3, 24
+; GFX6-NEXT: s_lshr_b32 s34, s3, 31
+; GFX6-NEXT: s_lshr_b32 s35, s3, 28
+; GFX6-NEXT: s_bfe_u32 s36, s2, 0x10003
+; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10001
+; GFX6-NEXT: s_bfe_u32 s11, s3, 0x10003
+; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001
+; GFX6-NEXT: s_and_b32 s37, s3, 0xff00
+; GFX6-NEXT: s_and_b32 s38, s3, 0xf0
+; GFX6-NEXT: s_and_b32 s39, s2, 0xff00
+; GFX6-NEXT: s_and_b32 s40, s2, 0xf0
+; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10016
+; GFX6-NEXT: s_bfe_u32 s6, s3, 0x10016
+; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10006
+; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000a
+; GFX6-NEXT: s_bfe_u32 s14, s2, 0x1000e
+; GFX6-NEXT: s_bfe_u32 s20, s2, 0x10012
+; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10010
+; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s42, s2, 0x10018
+; GFX6-NEXT: s_bfe_u32 s43, s2, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s44, s2, 0x1001c
+; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10006
+; GFX6-NEXT: s_bfe_u32 s46, s3, 0x1000a
+; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1000e
+; GFX6-NEXT: s_bfe_u32 s48, s3, 0x10012
+; GFX6-NEXT: s_bfe_u32 s49, s3, 0x10010
+; GFX6-NEXT: s_bfe_u32 s50, s3, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s51, s3, 0x10018
+; GFX6-NEXT: s_bfe_u32 s52, s3, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s53, s3, 0x1001c
+; GFX6-NEXT: s_and_b32 s18, s2, 1
+; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10002
+; GFX6-NEXT: s_and_b32 s2, s3, 1
+; GFX6-NEXT: s_bfe_u32 s12, s3, 0x10002
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s66
-; GFX6-NEXT: v_mov_b32_e32 v1, s36
-; GFX6-NEXT: v_mov_b32_e32 v2, s65
-; GFX6-NEXT: v_mov_b32_e32 v3, s35
-; GFX6-NEXT: v_mov_b32_e32 v4, s64
-; GFX6-NEXT: v_mov_b32_e32 v5, s34
-; GFX6-NEXT: v_mov_b32_e32 v6, s63
-; GFX6-NEXT: v_mov_b32_e32 v7, s33
-; GFX6-NEXT: v_mov_b32_e32 v8, s67
-; GFX6-NEXT: v_mov_b32_e32 v9, s31
-; GFX6-NEXT: v_mov_b32_e32 v10, s62
-; GFX6-NEXT: v_mov_b32_e32 v11, s30
-; GFX6-NEXT: v_mov_b32_e32 v12, s61
-; GFX6-NEXT: v_mov_b32_e32 v13, s29
-; GFX6-NEXT: v_mov_b32_e32 v14, s60
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GFX6-NEXT: v_mov_b32_e32 v15, s28
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v0, s68
-; GFX6-NEXT: v_mov_b32_e32 v1, s27
-; GFX6-NEXT: v_mov_b32_e32 v2, s59
-; GFX6-NEXT: v_mov_b32_e32 v3, s26
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GFX6-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NEXT: s_bfe_u32 s9, s24, 0x10007
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: s_bfe_u32 s12, s24, 0x40004
+; GFX6-NEXT: v_mov_b32_e32 v3, s11
+; GFX6-NEXT: s_bfe_u32 s11, s25, 0x10007
+; GFX6-NEXT: v_mov_b32_e32 v4, s18
+; GFX6-NEXT: s_bfe_u32 s18, s25, 0x40004
+; GFX6-NEXT: s_bfe_u32 s13, s13, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v5, s15
+; GFX6-NEXT: s_bfe_u32 s15, s17, 0x10003
+; GFX6-NEXT: s_bfe_u32 s17, s17, 0x10001
+; GFX6-NEXT: s_bfe_u32 s19, s19, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v6, s23
+; GFX6-NEXT: s_bfe_u32 s23, s24, 0x10003
+; GFX6-NEXT: v_mov_b32_e32 v7, s36
+; GFX6-NEXT: s_bfe_u32 s36, s24, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v8, s53
+; GFX6-NEXT: s_bfe_u32 s53, s26, 0x10003
+; GFX6-NEXT: s_bfe_u32 s26, s26, 0x10001
+; GFX6-NEXT: s_bfe_u32 s27, s27, 0x10001
+; GFX6-NEXT: s_bfe_u32 s28, s28, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v10, s52
+; GFX6-NEXT: s_bfe_u32 s52, s29, 0x10003
+; GFX6-NEXT: s_bfe_u32 s29, s29, 0x10001
+; GFX6-NEXT: s_bfe_u32 s31, s31, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v11, s34
+; GFX6-NEXT: s_bfe_u32 s34, s25, 0x10003
+; GFX6-NEXT: v_mov_b32_e32 v12, s51
+; GFX6-NEXT: s_bfe_u32 s51, s25, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v14, s50
+; GFX6-NEXT: s_bfe_u32 s50, s33, 0x10003
+; GFX6-NEXT: s_bfe_u32 s33, s33, 0x10001
+; GFX6-NEXT: s_bfe_u32 s35, s35, 0x10001
+; GFX6-NEXT: s_and_b32 s25, s25, 0xf0
+; GFX6-NEXT: s_and_b32 s24, s24, 0xf0
+; GFX6-NEXT: s_bfe_u32 s40, s40, 0x10004
+; GFX6-NEXT: v_mov_b32_e32 v16, s49
+; GFX6-NEXT: s_bfe_u32 s49, s39, 0x10008
+; GFX6-NEXT: s_bfe_u32 s39, s39, 0x1000c
+; GFX6-NEXT: s_bfe_u32 s38, s38, 0x10004
+; GFX6-NEXT: v_mov_b32_e32 v18, s48
+; GFX6-NEXT: s_bfe_u32 s48, s37, 0x10008
+; GFX6-NEXT: s_bfe_u32 s37, s37, 0x1000c
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v2, s47
+; GFX6-NEXT: v_mov_b32_e32 v3, s30
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s58
-; GFX6-NEXT: v_mov_b32_e32 v1, s25
-; GFX6-NEXT: v_mov_b32_e32 v2, s57
-; GFX6-NEXT: v_mov_b32_e32 v3, s24
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GFX6-NEXT: v_mov_b32_e32 v4, s46
+; GFX6-NEXT: v_mov_b32_e32 v9, s35
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
+; GFX6-NEXT: v_mov_b32_e32 v6, s45
+; GFX6-NEXT: v_mov_b32_e32 v7, s22
+; GFX6-NEXT: v_mov_b32_e32 v13, s33
+; GFX6-NEXT: v_mov_b32_e32 v15, s50
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v8, s44
+; GFX6-NEXT: v_mov_b32_e32 v10, s43
+; GFX6-NEXT: v_mov_b32_e32 v11, s16
+; GFX6-NEXT: v_mov_b32_e32 v17, s51
+; GFX6-NEXT: v_mov_b32_e32 v19, s34
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v12, s42
+; GFX6-NEXT: v_mov_b32_e32 v14, s41
+; GFX6-NEXT: v_mov_b32_e32 v0, s37
+; GFX6-NEXT: v_mov_b32_e32 v1, s31
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v16, s21
+; GFX6-NEXT: v_mov_b32_e32 v18, s20
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s56
-; GFX6-NEXT: v_mov_b32_e32 v1, s23
-; GFX6-NEXT: v_mov_b32_e32 v2, s55
-; GFX6-NEXT: v_mov_b32_e32 v3, s22
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GFX6-NEXT: v_mov_b32_e32 v2, s48
+; GFX6-NEXT: v_mov_b32_e32 v3, s29
+; GFX6-NEXT: v_mov_b32_e32 v5, s52
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s53
-; GFX6-NEXT: v_mov_b32_e32 v1, s21
-; GFX6-NEXT: v_mov_b32_e32 v2, s54
-; GFX6-NEXT: v_mov_b32_e32 v3, s20
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NEXT: v_mov_b32_e32 v4, s38
+; GFX6-NEXT: v_mov_b32_e32 v5, s28
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s52
+; GFX6-NEXT: v_mov_b32_e32 v4, s10
+; GFX6-NEXT: v_mov_b32_e32 v9, s27
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
+; GFX6-NEXT: v_mov_b32_e32 v6, s8
+; GFX6-NEXT: v_mov_b32_e32 v7, s4
+; GFX6-NEXT: v_mov_b32_e32 v13, s26
+; GFX6-NEXT: v_mov_b32_e32 v15, s53
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v8, s6
+; GFX6-NEXT: v_mov_b32_e32 v17, s36
+; GFX6-NEXT: v_mov_b32_e32 v19, s23
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v10, s5
+; GFX6-NEXT: s_bfe_u32 s4, s12, 0x10001
+; GFX6-NEXT: s_bfe_u32 s5, s18, 0x10001
+; GFX6-NEXT: s_bfe_u32 s6, s24, 0x10004
+; GFX6-NEXT: s_bfe_u32 s7, s25, 0x10004
+; GFX6-NEXT: v_mov_b32_e32 v0, s39
; GFX6-NEXT: v_mov_b32_e32 v1, s19
-; GFX6-NEXT: v_mov_b32_e32 v2, s51
-; GFX6-NEXT: v_mov_b32_e32 v3, s18
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s50
-; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s49
-; GFX6-NEXT: v_mov_b32_e32 v3, s16
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s48
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
-; GFX6-NEXT: v_mov_b32_e32 v2, s47
-; GFX6-NEXT: v_mov_b32_e32 v3, s14
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s46
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s45
-; GFX6-NEXT: v_mov_b32_e32 v3, s12
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s44
-; GFX6-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NEXT: v_mov_b32_e32 v2, s43
-; GFX6-NEXT: v_mov_b32_e32 v3, s10
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NEXT: v_mov_b32_e32 v3, s17
+; GFX6-NEXT: v_mov_b32_e32 v5, s15
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s42
-; GFX6-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-NEXT: v_mov_b32_e32 v2, s41
-; GFX6-NEXT: v_mov_b32_e32 v3, s8
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NEXT: v_mov_b32_e32 v4, s40
+; GFX6-NEXT: v_mov_b32_e32 v5, s13
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX6-NEXT: v_mov_b32_e32 v9, s11
+; GFX6-NEXT: v_mov_b32_e32 v11, s9
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s40
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: v_mov_b32_e32 v2, s39
-; GFX6-NEXT: v_mov_b32_e32 v3, s6
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NEXT: v_mov_b32_e32 v6, s7
+; GFX6-NEXT: v_mov_b32_e32 v7, s5
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s37
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NEXT: v_mov_b32_e32 v3, s4
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v8, s6
+; GFX6-NEXT: v_mov_b32_e32 v9, s4
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 0xf0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s7, s3, 1
-; GFX8-NEXT: s_and_b32 s9, s2, 1
-; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011
+; GFX8-NEXT: s_lshr_b32 s10, s3, 16
+; GFX8-NEXT: s_lshr_b32 s9, s2, 16
+; GFX8-NEXT: s_lshr_b32 s8, s3, 24
+; GFX8-NEXT: s_lshr_b32 s7, s2, 24
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10016
+; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10016
+; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10018
+; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10018
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10012
; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
-; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v27, s11
-; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
-; GFX8-NEXT: v_mov_b32_e32 v22, s25
-; GFX8-NEXT: v_mov_b32_e32 v23, s24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_mov_b32_e32 v27, s11
-; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v22, s23
-; GFX8-NEXT: v_mov_b32_e32 v23, s22
-; GFX8-NEXT: v_mov_b32_e32 v24, s21
-; GFX8-NEXT: v_mov_b32_e32 v25, s20
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_mov_b32_e32 v27, s11
-; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
-; GFX8-NEXT: v_mov_b32_e32 v22, s19
-; GFX8-NEXT: v_mov_b32_e32 v23, s18
-; GFX8-NEXT: v_mov_b32_e32 v24, s17
-; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_mov_b32_e32 v27, s11
-; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v22, s15
-; GFX8-NEXT: v_mov_b32_e32 v23, s14
-; GFX8-NEXT: v_mov_b32_e32 v24, s13
-; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
-; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
-; GFX8-NEXT: v_mov_b32_e32 v25, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
-; GFX8-NEXT: v_and_b32_e32 v21, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v27, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3
-; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
-; GFX8-NEXT: v_and_b32_e32 v28, 1, v22
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
+; GFX8-NEXT: s_and_b32 s16, s3, 1
+; GFX8-NEXT: s_bfe_u32 s17, s3, 0x10012
+; GFX8-NEXT: s_bfe_u32 s18, s3, 0x10010
+; GFX8-NEXT: s_and_b32 s19, s2, 1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s2
+; GFX8-NEXT: s_add_u32 s12, s0, 48
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
+; GFX8-NEXT: v_and_b32_e32 v24, 2, v20
+; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: v_and_b32_e32 v22, 1, v21
+; GFX8-NEXT: v_lshrrev_b16_e32 v21, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v25, s13
+; GFX8-NEXT: v_mov_b32_e32 v24, s12
+; GFX8-NEXT: s_add_u32 s12, s0, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 8, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX8-NEXT: v_and_b32_e32 v20, 1, v19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
-; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 10, s2
+; GFX8-NEXT: v_and_b32_e32 v26, 2, v18
+; GFX8-NEXT: v_and_b32_e32 v27, 8, v18
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3
-; GFX8-NEXT: v_mov_b32_e32 v25, 1
-; GFX8-NEXT: v_mov_b32_e32 v21, s11
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
-; GFX8-NEXT: v_and_b32_e32 v23, 1, v19
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX8-NEXT: v_and_b32_sdwa v16, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v20, s10
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s6
-; GFX8-NEXT: v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
-; GFX8-NEXT: v_and_b32_e32 v20, 1, v14
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6
-; GFX8-NEXT: s_add_u32 s10, s0, 16
-; GFX8-NEXT: v_and_b32_e32 v17, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT: v_and_b32_e32 v19, 1, v15
-; GFX8-NEXT: v_mov_b32_e32 v16, s11
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX8-NEXT: v_mov_b32_e32 v15, s10
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: v_and_b32_e32 v15, 1, v11
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v9
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s10, s0, 0xb0
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3
-; GFX8-NEXT: v_and_b32_e32 v11, 1, v8
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v9, s10
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v13, 1, v8
-; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT: v_mov_b32_e32 v10, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8
-; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8]
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX8-NEXT: v_mov_b32_e32 v23, s13
+; GFX8-NEXT: v_lshrrev_b16_e32 v21, 3, v27
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v19
+; GFX8-NEXT: v_lshrrev_b16_e32 v19, 1, v26
+; GFX8-NEXT: v_mov_b32_e32 v22, s12
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s2
+; GFX8-NEXT: v_and_b32_e64 v16, s2, 2
+; GFX8-NEXT: v_and_b32_e64 v17, s2, 8
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT: s_add_u32 s12, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v20, s1
+; GFX8-NEXT: v_and_b32_e64 v13, s10, 2
+; GFX8-NEXT: v_and_b32_e64 v14, s10, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v18, 3, v17
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 1, v16
+; GFX8-NEXT: v_and_b32_e32 v17, 1, v15
+; GFX8-NEXT: v_mov_b32_e32 v15, s19
+; GFX8-NEXT: v_mov_b32_e32 v19, s0
+; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 3, v14
+; GFX8-NEXT: v_lshrrev_b16_e32 v18, 1, v13
+; GFX8-NEXT: v_mov_b32_e32 v14, s13
+; GFX8-NEXT: v_mov_b32_e32 v13, s12
+; GFX8-NEXT: s_add_u32 s12, s0, 0xb0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 12, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s18
+; GFX8-NEXT: v_mov_b32_e32 v19, s17
+; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: v_and_b32_e32 v11, 2, v10
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 14, s3
+; GFX8-NEXT: v_and_b32_e32 v24, s2, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 6, s2
+; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[17:20]
; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v14, 1, v18
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20
-; GFX8-NEXT: v_and_b32_e32 v20, 1, v0
+; GFX8-NEXT: v_mov_b32_e32 v18, s13
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 8, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 10, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s3
+; GFX8-NEXT: v_and_b32_e64 v4, s3, 2
+; GFX8-NEXT: v_and_b32_e64 v5, s3, 8
+; GFX8-NEXT: v_and_b32_e32 v26, s3, v0
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v11, 1, v11
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT: v_mov_b32_e32 v17, s12
+; GFX8-NEXT: v_lshrrev_b16_e64 v31, 6, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[10:13]
+; GFX8-NEXT: v_and_b32_e32 v7, 2, v6
+; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_and_b32_e32 v9, 8, v6
+; GFX8-NEXT: v_mov_b32_e32 v10, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x80
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v22
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v27
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v26
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 3, v9
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6
-; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4]
+; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 1, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s16
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6
-; GFX8-NEXT: v_and_b32_e32 v16, 1, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
+; GFX8-NEXT: v_and_b32_e64 v1, s9, 2
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_and_b32_e64 v2, s9, 8
+; GFX8-NEXT: v_mov_b32_e32 v5, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 3, v2
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s15
+; GFX8-NEXT: v_mov_b32_e32 v3, s14
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v12
-; GFX8-NEXT: v_mov_b32_e32 v12, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v25, 4, v24
+; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_and_b32_e32 v28, 2, v25
+; GFX8-NEXT: v_and_b32_e32 v12, s9, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_and_b32_e32 v7, s10, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 4, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v24
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v27
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v28
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 4, s8
+; GFX8-NEXT: v_and_b32_e32 v17, 2, v13
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_and_b32_e32 v22, 2, v21
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v12
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 6, s8
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v17
+; GFX8-NEXT: v_and_b32_e32 v19, 1, v23
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s8
+; GFX8-NEXT: v_lshrrev_b16_e32 v18, 1, v22
+; GFX8-NEXT: v_and_b32_e32 v17, 1, v21
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[17:20]
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s8
+; GFX8-NEXT: v_and_b32_e64 v16, s8, 2
+; GFX8-NEXT: v_and_b32_e64 v14, s8, 8
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_lshrrev_b16_e32 v29, 4, v26
+; GFX8-NEXT: v_lshrrev_b16_e32 v24, 7, v26
+; GFX8-NEXT: v_lshrrev_b16_e32 v28, 3, v14
+; GFX8-NEXT: v_lshrrev_b16_e32 v26, 1, v16
+; GFX8-NEXT: v_and_b32_e32 v27, 1, v15
+; GFX8-NEXT: v_mov_b32_e32 v25, s11
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[25:28]
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_and_b32_e32 v30, 2, v29
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT: v_and_b32_e32 v23, 1, v31
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 1, v30
+; GFX8-NEXT: v_and_b32_e32 v21, 1, v29
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT: v_lshrrev_b16_e64 v32, 4, s7
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[21:24]
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_and_b32_e32 v33, 2, v32
+; GFX8-NEXT: v_lshrrev_b16_e64 v34, 6, s7
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s7
+; GFX8-NEXT: v_and_b32_e32 v19, 1, v34
+; GFX8-NEXT: v_lshrrev_b16_e32 v18, 1, v33
+; GFX8-NEXT: v_and_b32_e32 v17, 1, v32
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 4, v7
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[17:20]
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s7
+; GFX8-NEXT: v_and_b32_e64 v9, s7, 2
+; GFX8-NEXT: v_and_b32_e64 v8, s7, 8
+; GFX8-NEXT: v_and_b32_e32 v11, 2, v6
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v11
+; GFX8-NEXT: v_lshrrev_b16_e32 v11, 3, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT: v_mov_b32_e32 v8, s6
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; GFX8-NEXT: s_add_u32 s0, s0, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v7
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, s5
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v64i1_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 96, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 57, @122, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T47.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T45.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T43.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1
+; EG-NEXT: ALU 86, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 83, @118, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T36.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T30.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T28.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T20.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_64 T21.XY, T19.X, 0, #1
+; EG-NEXT: VTX_READ_64 T19.XY, T19.X, 0, #1
; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T20.X, PV.W, literal.x,
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 25:
-; EG-NEXT: BFE_UINT * T19.W, T21.X, literal.x, 1,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T19.Z, T21.X, literal.x, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.Y, T21.X, 1, 1,
-; EG-NEXT: BFE_UINT * T20.W, T21.X, literal.x, 1,
-; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T19.X, T21.X, 1,
-; EG-NEXT: BFE_UINT T20.Z, T21.X, literal.x, 1,
-; EG-NEXT: LSHR * T22.X, KC0[2].Y, literal.y,
-; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT: BFE_UINT T20.Y, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT * T23.W, T21.X, literal.y, 1,
-; EG-NEXT: 5(7.006492e-45), 11(1.541428e-44)
-; EG-NEXT: BFE_UINT T20.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T23.Z, T21.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T24.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T23.Y, T21.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T25.W, T21.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44)
-; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T23.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T25.Z, T21.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 31:
+; EG-NEXT: LSHR T21.X, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: LSHR T22.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT: LSHR T24.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T26.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T25.Y, T21.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T27.W, T21.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T25.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T27.Z, T21.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44)
-; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T27.Y, T21.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T29.W, T21.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44)
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T27.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T29.Z, T21.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44)
-; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT: LSHR T29.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T29.Y, T21.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T31.W, T21.X, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T29.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T31.Z, T21.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44)
-; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T32.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T31.Y, T21.X, literal.y, 1,
-; EG-NEXT: LSHR * T33.W, T21.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T31.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T33.Z, T21.X, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44)
-; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
+; EG-NEXT: LSHR T31.X, PV.W, literal.x,
+; EG-NEXT: LSHR T32.W, T19.X, literal.y,
+; EG-NEXT: LSHR * T33.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: BFE_UINT T32.Z, T19.X, literal.x, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 30(4.203895e-44), 128(1.793662e-43)
; EG-NEXT: LSHR T34.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T33.Y, T21.X, literal.y, 1,
-; EG-NEXT: BFE_UINT * T35.W, T21.Y, literal.z, 1,
+; EG-NEXT: BFE_UINT T32.Y, T19.X, literal.y, 1,
+; EG-NEXT: AND_INT T0.Z, T19.Y, literal.z,
+; EG-NEXT: LSHR * T0.W, T19.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44)
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T33.X, T21.X, literal.x, 1,
-; EG-NEXT: BFE_UINT T35.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 28(3.923636e-44), 2(2.802597e-45)
-; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T35.Y, T21.Y, 1, 1,
-; EG-NEXT: BFE_UINT T36.W, T21.Y, literal.y, 1,
-; EG-NEXT: AND_INT * T35.X, T21.Y, 1,
-; EG-NEXT: 2(2.802597e-45), 7(9.809089e-45)
-; EG-NEXT: BFE_UINT T36.Z, T21.Y, literal.x, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 6(8.407791e-45), 128(1.793662e-43)
-; EG-NEXT: LSHR T37.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T36.Y, T21.Y, literal.y, 1,
-; EG-NEXT: BFE_UINT * T38.W, T21.Y, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 5(7.006492e-45)
-; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T36.X, T21.Y, literal.x, 1,
-; EG-NEXT: BFE_UINT T38.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44)
-; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 122:
-; EG-NEXT: LSHR T39.X, T0.W, literal.x,
-; EG-NEXT: BFE_UINT T38.Y, T21.Y, literal.y, 1,
-; EG-NEXT: BFE_UINT * T40.W, T21.Y, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44)
-; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T38.X, T21.Y, literal.x, 1,
-; EG-NEXT: BFE_UINT T40.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T41.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T40.Y, T21.Y, literal.y, 1,
-; EG-NEXT: BFE_UINT * T42.W, T21.Y, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T40.X, T21.Y, literal.x, 1,
-; EG-NEXT: BFE_UINT T42.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44)
-; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T43.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T42.Y, T21.Y, literal.y, 1,
-; EG-NEXT: BFE_UINT * T44.W, T21.Y, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44)
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T42.X, T21.Y, literal.x, 1,
-; EG-NEXT: BFE_UINT T44.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44)
-; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T45.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T44.Y, T21.Y, literal.y, 1,
-; EG-NEXT: BFE_UINT * T46.W, T21.Y, literal.z, 1,
-; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T44.X, T21.Y, literal.x, 1,
-; EG-NEXT: BFE_UINT T46.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44)
-; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T47.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T46.Y, T21.Y, literal.y, 1,
-; EG-NEXT: LSHR * T48.W, T21.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44)
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: LSHR * T35.W, T19.Y, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T46.X, T21.Y, literal.x, 1,
-; EG-NEXT: BFE_UINT T48.Z, T21.Y, literal.y, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44)
-; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T49.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT * T48.Y, T21.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T32.X, T19.X, literal.x, 1,
+; EG-NEXT: AND_INT T0.Y, T0.W, literal.y,
+; EG-NEXT: BFE_UINT T35.Z, T19.Y, literal.z, 1,
+; EG-NEXT: LSHR * T0.W, T19.Y, literal.w,
+; EG-NEXT: 28(3.923636e-44), 8(1.121039e-44)
+; EG-NEXT: 30(4.203895e-44), 16(2.242078e-44)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T36.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T35.Y, T19.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T37.Z, T19.Y, literal.x, 1,
+; EG-NEXT: AND_INT T0.W, T0.W, literal.z,
+; EG-NEXT: LSHR * T1.W, T19.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44)
-; EG-NEXT: BFE_UINT T48.X, T21.Y, literal.x, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 28(3.923636e-44), 240(3.363116e-43)
-; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: 8(1.121039e-44), 12(1.681558e-44)
+; EG-NEXT: BFE_UINT T35.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T37.Y, T19.Y, 1, 1,
+; EG-NEXT: BFE_UINT T38.Z, T19.Y, literal.y, 1,
+; EG-NEXT: AND_INT T1.W, PS, literal.z,
+; EG-NEXT: LSHR * T2.W, T19.Y, literal.w,
+; EG-NEXT: 28(3.923636e-44), 26(3.643376e-44)
+; EG-NEXT: 8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT: AND_INT T37.X, T19.Y, 1,
+; EG-NEXT: BFE_UINT T38.Y, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T39.Z, T19.Y, literal.y, 1,
+; EG-NEXT: LSHR T3.W, T19.Y, literal.z,
+; EG-NEXT: LSHR * T4.W, T19.Y, literal.w,
+; EG-NEXT: 25(3.503246e-44), 18(2.522337e-44)
+; EG-NEXT: 4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T38.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T39.Y, T19.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T40.Z, T19.Y, literal.z, 1,
+; EG-NEXT: AND_INT T4.W, PS, literal.w,
+; EG-NEXT: AND_INT * T3.W, PV.W, literal.w,
+; EG-NEXT: 24(3.363116e-44), 17(2.382207e-44)
+; EG-NEXT: 10(1.401298e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T39.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T40.Y, T19.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T41.Z, T19.X, literal.z, 1,
+; EG-NEXT: LSHR * T5.W, T19.X, literal.w,
+; EG-NEXT: 16(2.242078e-44), 9(1.261169e-44)
+; EG-NEXT: 26(3.643376e-44), 20(2.802597e-44)
+; EG-NEXT: ALU clause starting at 118:
+; EG-NEXT: LSHR * T6.W, T19.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T40.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T41.Y, T19.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T42.Z, T19.X, literal.z, 1,
+; EG-NEXT: AND_INT T6.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T5.W, T5.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 25(3.503246e-44)
+; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T41.X, T19.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T42.Y, T19.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T43.Z, T19.X, literal.z, 1,
+; EG-NEXT: LSHR * T7.W, T19.X, literal.w,
+; EG-NEXT: 24(3.363116e-44), 17(2.382207e-44)
+; EG-NEXT: 10(1.401298e-44), 12(1.681558e-44)
+; EG-NEXT: LSHR * T8.W, T19.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T42.X, T19.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T43.Y, T19.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T44.Z, T19.X, literal.z, 1,
+; EG-NEXT: AND_INT T8.W, PV.W, literal.w,
+; EG-NEXT: AND_INT * T7.W, T7.W, literal.w,
+; EG-NEXT: 16(2.242078e-44), 9(1.261169e-44)
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T43.X, T19.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T44.Y, T19.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T45.Z, T19.X, literal.z, 1,
+; EG-NEXT: LSHR T9.W, T19.X, literal.w,
+; EG-NEXT: LSHR * T10.W, T19.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 5(7.006492e-45)
+; EG-NEXT: 14(1.961818e-44), 4(5.605194e-45)
+; EG-NEXT: BFE_UINT T44.X, T19.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T45.Y, T19.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T46.Z, T19.X, literal.z, 1,
+; EG-NEXT: AND_INT T10.W, PS, literal.w,
+; EG-NEXT: AND_INT * T9.W, PV.W, literal.w,
+; EG-NEXT: 4(5.605194e-45), 13(1.821688e-44)
+; EG-NEXT: 22(3.082857e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T45.X, T19.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T46.Y, T19.X, literal.y, 1,
+; EG-NEXT: BFE_UINT T47.Z, T19.Y, literal.z, 1,
+; EG-NEXT: LSHR T44.W, PS, literal.w,
+; EG-NEXT: LSHR * T43.W, PV.W, literal.w,
+; EG-NEXT: 12(1.681558e-44), 21(2.942727e-44)
+; EG-NEXT: 6(8.407791e-45), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T46.X, T19.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T47.Y, T19.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T48.Z, T19.Y, literal.z, 1,
+; EG-NEXT: LSHR T45.W, T7.W, literal.w,
+; EG-NEXT: LSHR * T42.W, T8.W, literal.w,
+; EG-NEXT: 20(2.802597e-44), 5(7.006492e-45)
+; EG-NEXT: 14(1.961818e-44), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T47.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T48.Y, T19.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T49.Z, T19.Y, literal.z, 1,
+; EG-NEXT: LSHR T46.W, T5.W, literal.w,
+; EG-NEXT: LSHR * T41.W, T6.W, literal.w,
+; EG-NEXT: 4(5.605194e-45), 13(1.821688e-44)
+; EG-NEXT: 22(3.082857e-44), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T48.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T49.Y, T19.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT T50.Z, T19.X, literal.z, 1,
+; EG-NEXT: LSHR T47.W, T3.W, literal.w,
+; EG-NEXT: LSHR * T40.W, T4.W, literal.w,
+; EG-NEXT: 12(1.681558e-44), 21(2.942727e-44)
+; EG-NEXT: 2(2.802597e-45), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T49.X, T19.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T50.Y, T19.X, 1, 1,
+; EG-NEXT: AND_INT T1.Z, T2.W, literal.y,
+; EG-NEXT: LSHR T48.W, T1.W, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T39.W, T0.W, literal.z,
+; EG-NEXT: 20(2.802597e-44), 8(1.121039e-44)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T50.X, T19.X, 1,
+; EG-NEXT: AND_INT T1.Y, T19.X, literal.x,
+; EG-NEXT: ADD_INT T2.Z, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T49.W, PV.Z, literal.z,
+; EG-NEXT: LSHR * T38.W, T0.Y, literal.z,
+; EG-NEXT: 8(1.121039e-44), 240(3.363116e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T19.X, PV.Z, literal.x,
+; EG-NEXT: LSHR T50.W, PV.Y, literal.y,
+; EG-NEXT: LSHR * T37.W, T0.Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 3(4.203895e-45)
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
@@ -3448,163 +3596,178 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2
+; GFX12-NEXT: v_and_b32_e64 v1, s3, 2
+; GFX12-NEXT: v_lshrrev_b16 v25, 12, s2
+; GFX12-NEXT: v_and_b32_e64 v2, 0xf0, s2
; GFX12-NEXT: s_lshr_b32 s4, s3, 24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2
-; GFX12-NEXT: v_and_b32_e32 v45, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10
-; GFX12-NEXT: v_and_b32_e32 v10, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3
-; GFX12-NEXT: v_and_b32_e32 v43, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3
-; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5
-; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
-; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
-; GFX12-NEXT: v_and_b32_e32 v12, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5
-; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010
-; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5
-; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v51, s18 :: v_dual_and_b32 v44, 1, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s5
-; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10015
-; GFX12-NEXT: v_and_b32_e32 v23, 1, v14
-; GFX12-NEXT: v_and_b32_e32 v14, 1, v18
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s5
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
-; GFX12-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_and_b32 v39, 1, v6
-; GFX12-NEXT: v_and_b32_e32 v32, 1, v11
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s4
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
-; GFX12-NEXT: v_and_b32_e32 v20, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s4
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
-; GFX12-NEXT: v_and_b32_e32 v24, 1, v15
-; GFX12-NEXT: v_lshrrev_b16 v15, 6, s4
-; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT: v_mov_b32_e32 v50, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT: v_and_b32_e32 v28, 1, v13
-; GFX12-NEXT: v_lshrrev_b16 v13, 4, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v17, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 4, s2
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX12-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:192
-; GFX12-NEXT: v_mov_b32_e32 v52, s12
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX12-NEXT: v_dual_mov_b32 v54, s10 :: v_dual_and_b32 v3, 1, v7
-; GFX12-NEXT: v_dual_mov_b32 v56, s8 :: v_dual_and_b32 v7, 1, v18
-; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s13
-; GFX12-NEXT: v_mov_b32_e32 v51, s3
-; GFX12-NEXT: v_dual_mov_b32 v53, s11 :: v_dual_and_b32 v18, 0xffff, v24
+; GFX12-NEXT: v_and_b32_e64 v4, 0xf0, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 4, s4
+; GFX12-NEXT: v_lshrrev_b16 v53, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 2, v25
+; GFX12-NEXT: v_lshrrev_b16 v22, 4, v2
+; GFX12-NEXT: s_lshr_b32 s5, s3, 16
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v29, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v41, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v37, 12, s3
+; GFX12-NEXT: v_and_b32_e64 v5, s5, 2
+; GFX12-NEXT: v_and_b32_e64 v9, s6, 8
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, v2
+; GFX12-NEXT: v_lshrrev_b16 v18, 4, v4
+; GFX12-NEXT: v_and_b32_e32 v2, 2, v17
+; GFX12-NEXT: v_lshrrev_b16 v46, 1, v1
+; GFX12-NEXT: s_bfe_u32 s11, s3, 0x10018
+; GFX12-NEXT: v_and_b32_e32 v1, 2, v22
+; GFX12-NEXT: s_lshr_b32 s7, s2, 16
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s3
+; GFX12-NEXT: v_and_b32_e64 v6, s5, 8
+; GFX12-NEXT: v_and_b32_e64 v10, s7, 2
+; GFX12-NEXT: v_and_b32_e32 v13, 8, v41
+; GFX12-NEXT: v_lshrrev_b16 v42, 1, v5
+; GFX12-NEXT: v_lshrrev_b16 v31, 3, v9
+; GFX12-NEXT: v_lshrrev_b16 v24, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v32, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 2, v18
+; GFX12-NEXT: v_lshrrev_b16 v9, 4, s6
+; GFX12-NEXT: v_and_b32_e64 v2, 0xf0, s7
+; GFX12-NEXT: v_and_b32_e64 v5, 0xf0, s5
+; GFX12-NEXT: v_and_b32_e32 v52, 0xffff, v29
+; GFX12-NEXT: v_and_b32_e64 v29, s3, 8
+; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10012
+; GFX12-NEXT: v_and_b32_e32 v14, 2, v37
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v38, 14, s3
+; GFX12-NEXT: v_lshrrev_b16 v39, 15, s3
+; GFX12-NEXT: v_lshrrev_b16 v34, 10, s3
+; GFX12-NEXT: v_and_b32_e64 v11, s7, 8
+; GFX12-NEXT: v_lshrrev_b16 v44, 3, v6
+; GFX12-NEXT: v_lshrrev_b16 v26, 1, v10
+; GFX12-NEXT: v_lshrrev_b16 v48, 3, v13
+; GFX12-NEXT: v_lshrrev_b16 v45, 1, v14
+; GFX12-NEXT: v_lshrrev_b16 v14, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 2, v9
+; GFX12-NEXT: v_lshrrev_b16 v6, 4, v2
+; GFX12-NEXT: v_lshrrev_b16 v13, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v10, 4, v5
+; GFX12-NEXT: v_lshrrev_b16 v29, 3, v29
+; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10018
+; GFX12-NEXT: v_and_b32_e32 v16, 8, v33
+; GFX12-NEXT: s_bfe_u32 s10, s3, 0x10010
+; GFX12-NEXT: v_and_b32_e32 v12, 2, v41
+; GFX12-NEXT: s_and_b32 s3, s3, 1
+; GFX12-NEXT: v_and_b32_e32 v15, 2, v33
+; GFX12-NEXT: v_lshrrev_b16 v27, 14, s2
+; GFX12-NEXT: v_and_b32_e64 v8, s6, 2
+; GFX12-NEXT: v_lshrrev_b16 v30, 2, s6
+; GFX12-NEXT: v_lshrrev_b16 v28, 3, v11
+; GFX12-NEXT: v_lshrrev_b16 v54, 1, v12
+; GFX12-NEXT: v_lshrrev_b16 v40, 1, v15
+; GFX12-NEXT: v_lshrrev_b16 v15, 7, v4
+; GFX12-NEXT: v_lshrrev_b16 v11, 6, s6
+; GFX12-NEXT: v_lshrrev_b16 v12, 7, s6
+; GFX12-NEXT: v_and_b32_e32 v49, 1, v25
+; GFX12-NEXT: v_and_b32_e64 v25, s4, 2
+; GFX12-NEXT: v_and_b32_e32 v55, 0xffff, v29
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
+; GFX12-NEXT: v_dual_mov_b32 v29, s8 :: v_dual_and_b32 v4, 2, v10
+; GFX12-NEXT: v_lshrrev_b16 v36, 3, v16
+; GFX12-NEXT: v_lshrrev_b16 v16, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 2, v6
+; GFX12-NEXT: v_and_b32_e64 v3, s2, 2
+; GFX12-NEXT: v_lshrrev_b16 v43, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v21, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v35, 1, v8
+; GFX12-NEXT: v_lshrrev_b16 v2, 7, v2
+; GFX12-NEXT: v_lshrrev_b16 v8, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v5, 7, v5
+; GFX12-NEXT: v_lshrrev_b16 v47, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v51, 1, v27
+; GFX12-NEXT: v_and_b32_e32 v50, 0xffff, v46
+; GFX12-NEXT: v_and_b32_e32 v46, 0xffff, v54
+; GFX12-NEXT: v_lshrrev_b16 v25, 1, v25
+; GFX12-NEXT: v_and_b32_e32 v54, 1, v7
+; GFX12-NEXT: v_and_b32_e64 v7, s2, 8
+; GFX12-NEXT: v_and_b32_e64 v27, s4, 8
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX12-NEXT: v_and_b32_e32 v1, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v8
+; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX12-NEXT: v_and_b32_e32 v5, 1, v10
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v47
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v15
+; GFX12-NEXT: v_and_b32_e32 v15, 1, v13
+; GFX12-NEXT: v_and_b32_e32 v13, 1, v18
+; GFX12-NEXT: v_and_b32_e32 v18, 0xffff, v24
; GFX12-NEXT: v_and_b32_e32 v24, 0xffff, v23
-; GFX12-NEXT: v_and_b32_e32 v23, 1, v22
-; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v28
-; GFX12-NEXT: v_and_b32_e32 v28, 0xffff, v27
-; GFX12-NEXT: v_and_b32_e32 v27, 1, v26
-; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v32
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v21
+; GFX12-NEXT: v_and_b32_e32 v21, 1, v22
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v32
; GFX12-NEXT: v_and_b32_e32 v32, 0xffff, v31
; GFX12-NEXT: v_and_b32_e32 v31, 1, v30
-; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v36
-; GFX12-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX12-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX12-NEXT: v_dual_mov_b32 v55, s9 :: v_dual_and_b32 v48, 0xffff, v17
-; GFX12-NEXT: v_dual_mov_b32 v17, s6 :: v_dual_and_b32 v20, 0xffff, v20
-; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX12-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v35
+; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v35
; GFX12-NEXT: v_and_b32_e32 v35, 1, v34
; GFX12-NEXT: v_and_b32_e32 v34, 0xffff, v40
; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v39
; GFX12-NEXT: v_and_b32_e32 v39, 1, v38
-; GFX12-NEXT: v_and_b32_e32 v38, 0xffff, v44
-; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v43
-; GFX12-NEXT: v_and_b32_e32 v43, 1, v41
-; GFX12-NEXT: v_and_b32_e32 v47, 1, v9
-; GFX12-NEXT: v_and_b32_e32 v46, 0xffff, v45
-; GFX12-NEXT: v_and_b32_e32 v45, 1, v1
-; GFX12-NEXT: v_and_b32_e32 v41, 1, v33
-; GFX12-NEXT: v_dual_mov_b32 v33, s7 :: v_dual_and_b32 v14, 0xffff, v14
-; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_and_b32 v42, 0xffff, v42
-; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX12-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v38, 0xffff, v45
+; GFX12-NEXT: v_and_b32_e32 v47, 1, v43
+; GFX12-NEXT: v_and_b32_e32 v45, 1, v41
+; GFX12-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:48
+; GFX12-NEXT: v_and_b32_e32 v49, 0xffff, v25
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, v3
+; GFX12-NEXT: v_lshrrev_b16 v25, 2, s2
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:32
+; GFX12-NEXT: v_lshrrev_b16 v7, 3, v7
+; GFX12-NEXT: v_lshrrev_b16 v27, 3, v27
+; GFX12-NEXT: v_lshrrev_b16 v41, 2, s4
+; GFX12-NEXT: v_lshrrev_b16 v19, 6, s4
+; GFX12-NEXT: v_lshrrev_b16 v20, 7, s4
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016
+; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10010
+; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: v_and_b32_e32 v37, 1, v37
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v6, 0xffff, v6
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_clause 0xd
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:160
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:96
+; GFX12-NEXT: v_and_b32_e32 v59, 0xffff, v7
+; GFX12-NEXT: v_and_b32_e32 v58, 1, v25
+; GFX12-NEXT: v_dual_mov_b32 v56, s2 :: v_dual_and_b32 v57, 0xffff, v3
+; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX12-NEXT: v_and_b32_e32 v33, 1, v33
+; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v44
+; GFX12-NEXT: v_and_b32_e32 v42, 0xffff, v42
+; GFX12-NEXT: v_and_b32_e32 v51, 0xffff, v27
+; GFX12-NEXT: v_and_b32_e32 v50, 1, v41
+; GFX12-NEXT: v_dual_mov_b32 v48, s11 :: v_dual_mov_b32 v41, s10
+; GFX12-NEXT: v_dual_mov_b32 v43, s9 :: v_dual_and_b32 v20, 0xffff, v20
+; GFX12-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX12-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_and_b32 v28, 0xffff, v28
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX12-NEXT: v_dual_mov_b32 v52, s3 :: v_dual_and_b32 v53, 0xffff, v53
+; GFX12-NEXT: v_mov_b32_e32 v25, s7
+; GFX12-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_and_b32 v14, 0xffff, v14
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[56:59], s[0:1]
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[48:51], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[52:55], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5336,23 +5499,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX6-NEXT: v_mov_b32_e32 v7, v1
; GFX6-NEXT: v_mov_b32_e32 v9, v1
; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v13, v1
-; GFX6-NEXT: v_mov_b32_e32 v15, v1
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v14, v0, 1, 1
-; GFX6-NEXT: v_bfe_u32 v10, v0, 3, 1
-; GFX6-NEXT: v_bfe_u32 v6, v0, 5, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v13, 4, v0
+; GFX6-NEXT: v_bfe_u32 v10, v0, 1, 1
+; GFX6-NEXT: v_bfe_u32 v6, v0, 3, 1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v0
-; GFX6-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX6-NEXT: v_bfe_u32 v8, v0, 2, 1
-; GFX6-NEXT: v_bfe_u32 v4, v0, 4, 1
+; GFX6-NEXT: v_bfe_u32 v12, v0, 4, 1
+; GFX6-NEXT: v_and_b32_e32 v8, 1, v0
+; GFX6-NEXT: v_bfe_u32 v4, v0, 2, 1
; GFX6-NEXT: v_bfe_u32 v0, v0, 6, 1
+; GFX6-NEXT: v_bfe_u32 v14, v13, 1, 1
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v13, v1
+; GFX6-NEXT: v_mov_b32_e32 v15, v1
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i64:
@@ -5366,16 +5530,14 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v1
@@ -5383,36 +5545,35 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v13, v1
; GFX8-NEXT: v_mov_b32_e32 v15, v1
; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: v_mov_b32_e32 v22, s0
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v10, 5, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v14, 3, v0
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v14, 6, v0
+; GFX8-NEXT: v_and_b32_e32 v6, 8, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
+; GFX8-NEXT: v_and_b32_e32 v10, 2, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v12, 4, v0
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 7, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v24, 1, v10
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v14, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v24
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX8-NEXT: v_and_b32_e32 v22, 2, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v10
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v6
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v14, 1, v22
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i1_to_v8i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 30, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 37, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
@@ -5424,34 +5585,41 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, 1,
-; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T6.X, T5.X, literal.x, 1,
; EG-NEXT: MOV T6.Y, 0.0,
-; EG-NEXT: BFE_UINT * T7.Z, T5.X, literal.y, 1,
-; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45)
-; EG-NEXT: BFE_UINT T7.X, T5.X, literal.x, 1,
-; EG-NEXT: MOV T7.Y, 0.0,
-; EG-NEXT: BFE_UINT * T8.Z, T5.X, literal.y, 1,
-; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T7.Z, T5.X, 1, 1,
+; EG-NEXT: AND_INT * T7.X, T5.X, 1,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T7.Y, 0.0,
; EG-NEXT: BFE_UINT T8.X, T5.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, T5.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T8.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: BFE_UINT T5.Z, T5.X, 1, 1,
-; EG-NEXT: AND_INT * T5.X, T5.X, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T5.X, T5.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T5.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T6.W, 0.0,
; EG-NEXT: MOV * T7.W, 0.0,
; EG-NEXT: MOV T8.W, 0.0,
; EG-NEXT: MOV * T5.W, 0.0,
-; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T9.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T10.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
@@ -5462,30 +5630,30 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
-; GFX12-NEXT: v_lshrrev_b16 v8, 3, v0
-; GFX12-NEXT: v_lshrrev_b16 v14, 1, v0
+; GFX12-NEXT: v_and_b32_e32 v10, 8, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v12, 2, v0
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v8, 1, v0
+; GFX12-NEXT: v_mov_b32_e32 v7, v1
; GFX12-NEXT: v_lshrrev_b16 v2, 7, v0
; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0
-; GFX12-NEXT: v_lshrrev_b16 v10, 4, v0
-; GFX12-NEXT: v_and_b32_e32 v17, 1, v4
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v18, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v16, 2, v0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v14, 1, v14
-; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v0, 1, v6
-; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v16
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v4, 1, v10
-; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v6, 0xffff, v17
-; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 2, v4
+; GFX12-NEXT: v_lshrrev_b16 v10, 3, v10
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, v12
+; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v12, 1, v4
+; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v4, 1, v14
+; GFX12-NEXT: v_lshrrev_b16 v14, 1, v0
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v0, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v10
+; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v10, 0xffff, v15
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1]
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5699,7 +5867,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s8, s6
; GFX6-NEXT: s_mov_b32 s9, s7
-; GFX6-NEXT: buffer_load_ushort v29, off, s[8:11], 0
+; GFX6-NEXT: buffer_load_ushort v25, off, s[8:11], 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v1
@@ -5719,32 +5887,36 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1
-; GFX6-NEXT: v_bfe_u32 v0, v29, 10, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GFX6-NEXT: v_bfe_u32 v5, v29, 9, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v25
+; GFX6-NEXT: v_bfe_u32 v0, v25, 14, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: v_bfe_u32 v5, v25, 3, 1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_bfe_u32 v3, v29, 8, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29
+; GFX6-NEXT: v_bfe_u32 v3, v25, 2, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
+; GFX6-NEXT: v_bfe_u32 v8, v25, 1, 1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_bfe_u32 v6, v29, 14, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
-; GFX6-NEXT: v_bfe_u32 v27, v29, 5, 1
-; GFX6-NEXT: v_bfe_u32 v23, v29, 7, 1
-; GFX6-NEXT: v_bfe_u32 v19, v29, 1, 1
-; GFX6-NEXT: v_bfe_u32 v15, v29, 3, 1
-; GFX6-NEXT: v_bfe_u32 v11, v29, 13, 1
-; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1
-; GFX6-NEXT: v_bfe_u32 v21, v29, 6, 1
-; GFX6-NEXT: v_and_b32_e32 v17, 1, v29
-; GFX6-NEXT: v_bfe_u32 v13, v29, 2, 1
+; GFX6-NEXT: v_and_b32_e32 v6, 1, v25
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX6-NEXT: v_bfe_u32 v0, v25, 4, 4
+; GFX6-NEXT: v_bfe_u32 v27, v0, 1, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v25
+; GFX6-NEXT: v_bfe_u32 v23, v0, 1, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v25
+; GFX6-NEXT: v_bfe_u32 v19, v0, 1, 1
+; GFX6-NEXT: v_bfe_u32 v15, v0, 3, 1
+; GFX6-NEXT: v_bfe_u32 v11, v25, 7, 1
+; GFX6-NEXT: v_bfe_u32 v21, v25, 12, 1
+; GFX6-NEXT: v_bfe_u32 v17, v25, 8, 1
+; GFX6-NEXT: v_bfe_u32 v13, v25, 10, 1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_bfe_u32 v9, v29, 12, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
-; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0
-; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
+; GFX6-NEXT: v_bfe_u32 v9, v25, 6, 1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xf0, v25
+; GFX6-NEXT: v_bfe_u32 v25, v0, 4, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:48
+; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80
+; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:64
+; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:96
; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_endpgm
;
@@ -5761,7 +5933,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_add_u32 s2, s0, 0x70
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0x50
+; GFX8-NEXT: s_add_u32 s4, s0, 48
; GFX8-NEXT: s_addc_u32 s5, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s5
; GFX8-NEXT: v_mov_b32_e32 v22, s4
@@ -5773,87 +5945,80 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v17, v2
; GFX8-NEXT: v_mov_b32_e32 v19, v2
; GFX8-NEXT: v_mov_b32_e32 v21, v2
-; GFX8-NEXT: v_mov_b32_e32 v25, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 6, v0
+; GFX8-NEXT: v_and_b32_e32 v16, 0xf0, v0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v16
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8]
; GFX8-NEXT: v_mov_b32_e32 v23, s3
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
; GFX8-NEXT: v_mov_b32_e32 v22, s2
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
+; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
+; GFX8-NEXT: v_mov_b32_e32 v23, s3
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v7, v2
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
-; GFX8-NEXT: v_mov_b32_e32 v23, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, 1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 8, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x50
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v10, 3, v2
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
+; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 64
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: v_mov_b32_e32 v25, s3
+; GFX8-NEXT: v_and_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v24, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 1, v2
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0
-; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v0
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v0
+; GFX8-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 10, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, 12, v0
+; GFX8-NEXT: v_and_b32_e32 v10, 8, v2
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[11:14]
; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_and_b32_e32 v11, 1, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[11:14]
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v14, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
+; GFX8-NEXT: v_and_b32_e32 v11, 2, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v12, 2, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, 4, v16
; GFX8-NEXT: v_and_b32_e32 v14, 1, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v0
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 3, v10
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 1, v11
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[14:17]
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[18:21]
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v16
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[14:17]
-; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[18:21]
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[22:25]
-; GFX8-NEXT: s_endpgm
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i1_to_v16i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 62, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 73, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
@@ -5877,28 +6042,38 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: BFE_UINT T9.X, T7.X, literal.x, 1,
; EG-NEXT: MOV T9.Y, 0.0,
; EG-NEXT: BFE_UINT * T10.Z, T7.X, literal.y, 1,
-; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44)
+; EG-NEXT: 12(1.681558e-44), 9(1.261169e-44)
; EG-NEXT: BFE_UINT T10.X, T7.X, literal.x, 1,
; EG-NEXT: MOV T10.Y, 0.0,
; EG-NEXT: BFE_UINT * T11.Z, T7.X, literal.y, 1,
-; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44)
+; EG-NEXT: 8(1.121039e-44), 5(7.006492e-45)
; EG-NEXT: BFE_UINT T11.X, T7.X, literal.x, 1,
; EG-NEXT: MOV T11.Y, 0.0,
-; EG-NEXT: BFE_UINT * T12.Z, T7.X, literal.y, 1,
-; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45)
-; EG-NEXT: BFE_UINT T12.X, T7.X, literal.x, 1,
-; EG-NEXT: MOV T12.Y, 0.0,
-; EG-NEXT: BFE_UINT * T13.Z, T7.X, literal.y, 1,
-; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45)
+; EG-NEXT: BFE_UINT T12.Z, T7.X, 1, 1,
+; EG-NEXT: AND_INT * T12.X, T7.X, 1,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T12.Y, 0.0,
; EG-NEXT: BFE_UINT T13.X, T7.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, T7.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T13.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T13.Y, 0.0,
-; EG-NEXT: BFE_UINT * T14.Z, T7.X, literal.y, 1,
-; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T14.X, T7.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 10(1.401298e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T14.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T14.Y, 0.0,
-; EG-NEXT: BFE_UINT T7.Z, T7.X, 1, 1,
-; EG-NEXT: AND_INT * T7.X, T7.X, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T7.X, T7.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T7.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T7.Y, 0.0,
; EG-NEXT: MOV T8.W, 0.0,
; EG-NEXT: MOV * T9.W, 0.0,
@@ -5908,21 +6083,22 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: MOV * T13.W, 0.0,
; EG-NEXT: MOV T14.W, 0.0,
; EG-NEXT: MOV * T7.W, 0.0,
-; EG-NEXT: LSHR T15.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: LSHR * T18.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T19.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T20.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
@@ -5939,56 +6115,57 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0
-; GFX12-NEXT: v_lshrrev_b16 v8, 9, v0
-; GFX12-NEXT: v_lshrrev_b16 v12, 13, v0
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0
+; GFX12-NEXT: v_and_b32_e32 v4, 0xf0, v0
+; GFX12-NEXT: v_lshrrev_b16 v18, 8, v0
+; GFX12-NEXT: v_lshrrev_b16 v20, 12, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v8, 8, v0
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v16, 2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v28, 8, v18
+; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v34, 2, v20
+; GFX12-NEXT: v_lshrrev_b16 v22, 10, v0
+; GFX12-NEXT: v_and_b32_e32 v33, 2, v18
+; GFX12-NEXT: v_lshrrev_b16 v35, 4, v4
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v24, 1, v20
+; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v20, 1, v18
+; GFX12-NEXT: v_mov_b32_e32 v17, v1
+; GFX12-NEXT: v_mov_b32_e32 v27, v1
; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0
; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0
-; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0
-; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0
-; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0
-; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0
-; GFX12-NEXT: v_and_b32_e32 v33, 1, v4
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, v0
-; GFX12-NEXT: v_lshrrev_b16 v18, 12, v0
-; GFX12-NEXT: v_and_b32_e32 v35, 1, v12
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32
+; GFX12-NEXT: v_lshrrev_b16 v10, 6, v0
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v30, 3, v8
+; GFX12-NEXT: v_lshrrev_b16 v32, 1, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 1, v22
+; GFX12-NEXT: v_lshrrev_b16 v18, 3, v28
+; GFX12-NEXT: v_lshrrev_b16 v22, 1, v33
+; GFX12-NEXT: v_lshrrev_b16 v33, 1, v34
+; GFX12-NEXT: v_and_b32_e32 v34, 2, v35
+; GFX12-NEXT: v_lshrrev_b16 v26, 7, v4
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v0
+; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v8, 1, v14
; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10
-; GFX12-NEXT: v_mov_b32_e32 v23, v1
+; GFX12-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_and_b32 v0, 1, v6
; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX12-NEXT: v_mov_b32_e32 v31, v1
-; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0
-; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0
-; GFX12-NEXT: v_and_b32_e32 v37, 1, v20
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30
-; GFX12-NEXT: v_and_b32_e32 v8, 1, v14
-; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34
-; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v20, 1, v26
-; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v16, 1, v22
-; GFX12-NEXT: v_and_b32_e32 v12, 1, v18
-; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35
-; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36
-; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32
-; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
-; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
-; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v30
+; GFX12-NEXT: v_lshrrev_b16 v30, 1, v34
+; GFX12-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_and_b32 v18, 0xffff, v18
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v32
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v26
+; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v26, 0xffff, v33
+; GFX12-NEXT: v_and_b32_e32 v28, 1, v35
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6348,286 +6525,286 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_bfe_u32 s5, s4, 0x10001
-; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10003
-; GFX6-NEXT: s_bfe_u32 s7, s4, 0x10005
-; GFX6-NEXT: s_bfe_u32 s8, s4, 0x10007
-; GFX6-NEXT: s_bfe_u32 s9, s4, 0x10009
-; GFX6-NEXT: s_bfe_u32 s10, s4, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s11, s4, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s12, s4, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s13, s4, 0x10011
-; GFX6-NEXT: s_bfe_u32 s14, s4, 0x10013
-; GFX6-NEXT: s_bfe_u32 s15, s4, 0x10015
-; GFX6-NEXT: s_bfe_u32 s16, s4, 0x10017
-; GFX6-NEXT: s_bfe_u32 s17, s4, 0x10019
-; GFX6-NEXT: s_bfe_u32 s18, s4, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s19, s4, 0x1001d
-; GFX6-NEXT: s_lshr_b32 s20, s4, 31
-; GFX6-NEXT: s_and_b32 s21, s4, 1
-; GFX6-NEXT: s_bfe_u32 s22, s4, 0x10002
-; GFX6-NEXT: s_bfe_u32 s23, s4, 0x10004
-; GFX6-NEXT: s_bfe_u32 s24, s4, 0x10006
-; GFX6-NEXT: s_bfe_u32 s25, s4, 0x10008
-; GFX6-NEXT: s_bfe_u32 s26, s4, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s27, s4, 0x1000c
+; GFX6-NEXT: s_lshr_b32 s5, s4, 16
+; GFX6-NEXT: s_bfe_u32 s6, s4, 0x40004
+; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008
+; GFX6-NEXT: s_bfe_u32 s8, s4, 0x4000c
+; GFX6-NEXT: s_lshr_b32 s9, s4, 24
+; GFX6-NEXT: s_lshr_b32 s10, s4, 28
+; GFX6-NEXT: s_bfe_u32 s11, s4, 0x10001
+; GFX6-NEXT: s_bfe_u32 s12, s4, 0x10003
+; GFX6-NEXT: s_bfe_u32 s13, s4, 0x10007
+; GFX6-NEXT: s_bfe_u32 s14, s4, 0x1000f
+; GFX6-NEXT: s_lshr_b32 s15, s4, 31
+; GFX6-NEXT: s_and_b32 s16, s4, 0xff00
+; GFX6-NEXT: s_and_b32 s17, s4, 0xf0
+; GFX6-NEXT: s_bfe_u32 s18, s4, 0x1000a
+; GFX6-NEXT: s_bfe_u32 s19, s4, 0x10010
+; GFX6-NEXT: s_bfe_u32 s20, s4, 0x10012
+; GFX6-NEXT: s_bfe_u32 s21, s4, 0x10016
+; GFX6-NEXT: s_bfe_u32 s22, s4, 0x10018
+; GFX6-NEXT: s_bfe_u32 s23, s4, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s24, s4, 0x1001c
+; GFX6-NEXT: s_and_b32 s25, s4, 1
+; GFX6-NEXT: s_bfe_u32 s26, s4, 0x10002
+; GFX6-NEXT: s_bfe_u32 s27, s4, 0x10006
; GFX6-NEXT: s_bfe_u32 s28, s4, 0x1000e
-; GFX6-NEXT: s_bfe_u32 s29, s4, 0x10010
-; GFX6-NEXT: s_bfe_u32 s30, s4, 0x10012
-; GFX6-NEXT: s_bfe_u32 s31, s4, 0x10014
-; GFX6-NEXT: s_bfe_u32 s33, s4, 0x10016
-; GFX6-NEXT: s_bfe_u32 s34, s4, 0x10018
-; GFX6-NEXT: s_bfe_u32 s35, s4, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s36, s4, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s4, s4, 0x1001c
-; GFX6-NEXT: v_mov_b32_e32 v0, s36
-; GFX6-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s29, s5, 0x40004
+; GFX6-NEXT: s_bfe_u32 s6, s6, 0x10001
+; GFX6-NEXT: s_bfe_u32 s30, s7, 0x10001
+; GFX6-NEXT: s_bfe_u32 s7, s7, 0x10003
+; GFX6-NEXT: s_bfe_u32 s8, s8, 0x10001
+; GFX6-NEXT: s_bfe_u32 s31, s5, 0x10001
+; GFX6-NEXT: s_bfe_u32 s33, s5, 0x10003
+; GFX6-NEXT: s_bfe_u32 s34, s5, 0x10007
+; GFX6-NEXT: s_bfe_u32 s35, s9, 0x10001
+; GFX6-NEXT: s_bfe_u32 s9, s9, 0x10003
+; GFX6-NEXT: s_bfe_u32 s10, s10, 0x10001
+; GFX6-NEXT: s_and_b32 s5, s5, 0xf0
+; GFX6-NEXT: s_bfe_u32 s17, s17, 0x10004
+; GFX6-NEXT: s_bfe_u32 s36, s16, 0x10008
+; GFX6-NEXT: s_bfe_u32 s16, s16, 0x1000c
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v2, s15
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_bfe_u32 s4, s29, 0x10001
+; GFX6-NEXT: s_bfe_u32 s5, s5, 0x10004
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v2, s19
+; GFX6-NEXT: v_mov_b32_e32 v0, s28
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s27
+; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s26
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s25
+; GFX6-NEXT: v_mov_b32_e32 v2, s11
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s24
+; GFX6-NEXT: v_mov_b32_e32 v2, s10
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s35
-; GFX6-NEXT: v_mov_b32_e32 v2, s18
+; GFX6-NEXT: v_mov_b32_e32 v0, s23
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s34
-; GFX6-NEXT: v_mov_b32_e32 v2, s17
+; GFX6-NEXT: v_mov_b32_e32 v0, s22
+; GFX6-NEXT: v_mov_b32_e32 v2, s35
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s33
-; GFX6-NEXT: v_mov_b32_e32 v2, s16
+; GFX6-NEXT: v_mov_b32_e32 v0, s21
+; GFX6-NEXT: v_mov_b32_e32 v2, s34
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s31
-; GFX6-NEXT: v_mov_b32_e32 v2, s15
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s30
-; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NEXT: v_mov_b32_e32 v2, s33
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s29
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s19
+; GFX6-NEXT: v_mov_b32_e32 v2, s31
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s28
-; GFX6-NEXT: v_mov_b32_e32 v2, s12
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s27
-; GFX6-NEXT: v_mov_b32_e32 v2, s11
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s26
-; GFX6-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: v_mov_b32_e32 v2, s7
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s25
-; GFX6-NEXT: v_mov_b32_e32 v2, s9
+; GFX6-NEXT: v_mov_b32_e32 v0, s36
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s24
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s23
-; GFX6-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s22
+; GFX6-NEXT: v_mov_b32_e32 v0, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s21
-; GFX6-NEXT: v_mov_b32_e32 v2, s5
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v24, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v15, v1
+; GFX8-NEXT: v_mov_b32_e32 v17, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
-; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
-; GFX8-NEXT: v_and_b32_e32 v11, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT: s_lshr_b32 s14, s2, 24
-; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
-; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018
-; GFX8-NEXT: s_and_b32 s11, s2, 1
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2
+; GFX8-NEXT: s_lshr_b32 s2, s4, 24
+; GFX8-NEXT: s_lshr_b32 s7, s4, 16
+; GFX8-NEXT: s_bfe_u32 s5, s4, 0x10010
+; GFX8-NEXT: s_bfe_u32 s6, s4, 0x10012
+; GFX8-NEXT: s_bfe_u32 s18, s4, 0x10016
+; GFX8-NEXT: s_bfe_u32 s19, s4, 0x10018
+; GFX8-NEXT: s_and_b32 s20, s4, 1
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s2
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
+; GFX8-NEXT: v_and_b32_e64 v20, s2, 8
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s2
+; GFX8-NEXT: v_and_b32_e64 v11, s2, 2
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x70
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x80
+; GFX8-NEXT: s_add_u32 s8, s0, 0xf0
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v3
; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0x70
-; GFX8-NEXT: v_and_b32_e32 v16, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14
-; GFX8-NEXT: v_mov_b32_e32 v23, s13
-; GFX8-NEXT: v_and_b32_e32 v25, 1, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0xf0
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
-; GFX8-NEXT: v_mov_b32_e32 v19, v1
-; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_mov_b32_e32 v23, s13
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s14
-; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x60
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v6
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s14
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX8-NEXT: v_mov_b32_e32 v15, s13
-; GFX8-NEXT: v_mov_b32_e32 v14, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s13
-; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 64
-; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[18:21]
+; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: s_add_u32 s8, s0, 48
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s4
+; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[14:17]
+; GFX8-NEXT: v_and_b32_e32 v25, s4, v24
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 7, v25
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: s_add_u32 s8, s0, 16
+; GFX8-NEXT: v_and_b32_e64 v18, s4, 8
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s4
+; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[14:17]
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v19
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 3, v18
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: s_add_u32 s8, s0, 0xd0
+; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[14:17]
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: s_add_u32 s8, s0, 0xc0
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_add_u32 s10, s0, 0xb0
+; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_add_u32 s12, s0, 0x90
; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v9
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v11
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10
-; GFX8-NEXT: v_mov_b32_e32 v10, 1
-; GFX8-NEXT: v_mov_b32_e32 v23, s13
-; GFX8-NEXT: v_and_b32_sdwa v18, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NEXT: v_and_b32_e32 v21, 1, v3
-; GFX8-NEXT: v_mov_b32_e32 v0, s21
-; GFX8-NEXT: v_mov_b32_e32 v2, s22
+; GFX8-NEXT: s_add_u32 s14, s0, 0x80
+; GFX8-NEXT: v_and_b32_e32 v24, s7, v24
+; GFX8-NEXT: s_addc_u32 s15, s1, 0
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 4, v24
+; GFX8-NEXT: s_add_u32 s16, s0, 0x60
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 12, s4
+; GFX8-NEXT: v_and_b32_e32 v18, 2, v6
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v21
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 3, v20
+; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: v_and_b32_e32 v23, 2, v22
+; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[14:17]
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v18
+; GFX8-NEXT: v_mov_b32_e32 v19, s17
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 14, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 8, s4
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v22
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 1, v23
+; GFX8-NEXT: v_mov_b32_e32 v18, s16
+; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_and_b32_e32 v13, 2, v12
+; GFX8-NEXT: v_and_b32_e32 v26, 2, v2
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v17, 1, v2
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 15, s4
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_and_b32_e64 v9, s4, 2
+; GFX8-NEXT: v_and_b32_e32 v8, 8, v12
+; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3]
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 1, v13
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 1, v11
+; GFX8-NEXT: v_mov_b32_e32 v12, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v9
+; GFX8-NEXT: v_mov_b32_e32 v0, s20
+; GFX8-NEXT: v_mov_b32_e32 v11, s0
+; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v12, s9
+; GFX8-NEXT: v_and_b32_e64 v7, s7, 2
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 10, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s19
-; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mov_b32_e32 v11, s8
+; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
+; GFX8-NEXT: v_and_b32_e32 v9, 1, v10
+; GFX8-NEXT: v_lshrrev_b16_e32 v11, 3, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v7
+; GFX8-NEXT: v_mov_b32_e32 v7, s10
+; GFX8-NEXT: v_and_b32_e64 v5, s7, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 7, v24
+; GFX8-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 3, v5
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v8, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s17
-; GFX8-NEXT: v_mov_b32_e32 v2, s18
-; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: v_mov_b32_e32 v8, s13
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NEXT: v_mov_b32_e32 v0, s16
-; GFX8-NEXT: v_mov_b32_e32 v2, s15
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: v_mov_b32_e32 v7, s14
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, v10
+; GFX8-NEXT: v_mov_b32_e32 v8, s15
+; GFX8-NEXT: s_add_u32 s2, s0, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_and_b32_e32 v15, 1, v24
-; GFX8-NEXT: v_mov_b32_e32 v22, v1
-; GFX8-NEXT: v_mov_b32_e32 v24, v1
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v12, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
+; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v5
; GFX8-NEXT: v_mov_b32_e32 v21, v1
+; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX8-NEXT: v_and_b32_e32 v9, 1, v12
-; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s1
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
-; GFX8-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NEXT: v_mov_b32_e32 v2, v14
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s0
; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[20:23]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX8-NEXT: v_mov_b32_e32 v16, v1
-; GFX8-NEXT: v_mov_b32_e32 v18, v1
+; GFX8-NEXT: s_add_u32 s2, s0, 32
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s14
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
+; GFX8-NEXT: v_lshrrev_b16_e32 v25, 4, v25
+; GFX8-NEXT: v_lshrrev_b16_e32 v19, 1, v26
+; GFX8-NEXT: v_mov_b32_e32 v18, v1
+; GFX8-NEXT: v_mov_b32_e32 v20, v1
+; GFX8-NEXT: s_add_u32 s0, s0, 0xa0
+; GFX8-NEXT: v_and_b32_e32 v27, 2, v25
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v26
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v25
+; GFX8-NEXT: v_mov_b32_e32 v17, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v25
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 1, v27
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17]
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v32i1_to_v32i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 96, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 30, @122, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 101, @25, KC0[], KC1[]
+; EG-NEXT: ALU 51, @127, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T42.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T41.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T40.X, 0
@@ -6660,60 +6837,86 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: BFE_UINT T13.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T13.Y, 0.0,
; EG-NEXT: BFE_UINT * T14.Z, T11.X, literal.y, 1,
-; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44)
+; EG-NEXT: 28(3.923636e-44), 25(3.503246e-44)
; EG-NEXT: BFE_UINT T14.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T14.Y, 0.0,
; EG-NEXT: BFE_UINT * T15.Z, T11.X, literal.y, 1,
-; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44)
+; EG-NEXT: 24(3.363116e-44), 21(2.942727e-44)
; EG-NEXT: BFE_UINT T15.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T15.Y, 0.0,
; EG-NEXT: BFE_UINT * T16.Z, T11.X, literal.y, 1,
-; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44)
+; EG-NEXT: 20(2.802597e-44), 17(2.382207e-44)
; EG-NEXT: BFE_UINT T16.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T16.Y, 0.0,
; EG-NEXT: BFE_UINT * T17.Z, T11.X, literal.y, 1,
-; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44)
+; EG-NEXT: 16(2.242078e-44), 13(1.821688e-44)
; EG-NEXT: BFE_UINT T17.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T17.Y, 0.0,
; EG-NEXT: BFE_UINT * T18.Z, T11.X, literal.y, 1,
-; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44)
+; EG-NEXT: 12(1.681558e-44), 9(1.261169e-44)
; EG-NEXT: BFE_UINT T18.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T18.Y, 0.0,
; EG-NEXT: BFE_UINT * T19.Z, T11.X, literal.y, 1,
-; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44)
+; EG-NEXT: 8(1.121039e-44), 5(7.006492e-45)
; EG-NEXT: BFE_UINT T19.X, T11.X, literal.x, 1,
; EG-NEXT: MOV T19.Y, 0.0,
-; EG-NEXT: BFE_UINT * T20.Z, T11.X, literal.y, 1,
-; EG-NEXT: 16(2.242078e-44), 15(2.101948e-44)
-; EG-NEXT: BFE_UINT T20.X, T11.X, literal.x, 1,
-; EG-NEXT: MOV T20.Y, 0.0,
-; EG-NEXT: BFE_UINT * T21.Z, T11.X, literal.y, 1,
-; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44)
+; EG-NEXT: BFE_UINT T20.Z, T11.X, 1, 1,
+; EG-NEXT: AND_INT * T20.X, T11.X, 1,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T20.Y, 0.0,
; EG-NEXT: BFE_UINT T21.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, T11.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T21.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T21.Y, 0.0,
-; EG-NEXT: BFE_UINT * T22.Z, T11.X, literal.y, 1,
-; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44)
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T22.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 26(3.643376e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T22.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T22.Y, 0.0,
-; EG-NEXT: BFE_UINT * T23.Z, T11.X, literal.y, 1,
-; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44)
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T23.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 22(3.082857e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T23.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T23.Y, 0.0,
-; EG-NEXT: BFE_UINT * T24.Z, T11.X, literal.y, 1,
-; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45)
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T24.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 18(2.522337e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T24.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T24.Y, 0.0,
-; EG-NEXT: BFE_UINT * T25.Z, T11.X, literal.y, 1,
-; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45)
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T25.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 14(1.961818e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T25.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T25.Y, 0.0,
-; EG-NEXT: BFE_UINT * T26.Z, T11.X, literal.y, 1,
-; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T26.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 10(1.401298e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T26.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T26.Y, 0.0,
-; EG-NEXT: BFE_UINT T11.Z, T11.X, 1, 1,
-; EG-NEXT: AND_INT * T11.X, T11.X, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T11.X, T11.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T11.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T11.Y, 0.0,
; EG-NEXT: MOV T12.W, 0.0,
; EG-NEXT: MOV * T13.W, 0.0,
@@ -6727,51 +6930,51 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: MOV * T21.W, 0.0,
; EG-NEXT: MOV T22.W, 0.0,
; EG-NEXT: MOV * T23.W, 0.0,
+; EG-NEXT: ALU clause starting at 127:
; EG-NEXT: MOV T24.W, 0.0,
; EG-NEXT: MOV * T25.W, 0.0,
; EG-NEXT: MOV T26.W, 0.0,
; EG-NEXT: MOV * T11.W, 0.0,
-; EG-NEXT: LSHR T27.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T31.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR * T32.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT: LSHR T32.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHR T33.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T34.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 122:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T33.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT: LSHR T34.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T35.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T36.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
+; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T37.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
+; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; EG-NEXT: LSHR T40.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
@@ -6787,109 +6990,119 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: v_and_b32_e64 v3, 0xf0, s2
+; GFX12-NEXT: v_and_b32_e64 v6, s2, 2
+; GFX12-NEXT: v_lshrrev_b16 v9, 12, s2
+; GFX12-NEXT: v_and_b32_e64 v14, s3, 8
+; GFX12-NEXT: v_and_b32_e64 v18, s4, 8
+; GFX12-NEXT: v_and_b32_e64 v19, s4, 2
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2
+; GFX12-NEXT: v_and_b32_e64 v4, s2, 8
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2
+; GFX12-NEXT: v_and_b32_e64 v16, 0xf0, s4
+; GFX12-NEXT: v_lshrrev_b16 v21, 7, v3
+; GFX12-NEXT: v_lshrrev_b16 v25, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v6, 2, v9
+; GFX12-NEXT: v_lshrrev_b16 v3, 4, v3
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v9
+; GFX12-NEXT: v_lshrrev_b16 v9, 3, v14
+; GFX12-NEXT: v_lshrrev_b16 v14, 3, v18
+; GFX12-NEXT: v_lshrrev_b16 v18, 1, v19
+; GFX12-NEXT: v_and_b32_e32 v19, 2, v20
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v0, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v23, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v22, 8, v8
+; GFX12-NEXT: v_and_b32_e32 v24, 2, v8
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v8
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v10
+; GFX12-NEXT: v_lshrrev_b16 v10, 7, v16
+; GFX12-NEXT: v_lshrrev_b16 v26, 4, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 1, v20
+; GFX12-NEXT: v_and_b32_e32 v28, 2, v3
+; GFX12-NEXT: v_and_b32_e32 v20, 1, v3
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, v19
+; GFX12-NEXT: v_lshrrev_b16 v5, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, s2
+; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10010
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v11, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v13, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 2, s3
+; GFX12-NEXT: v_and_b32_e64 v15, s3, 2
+; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v19, 2, v26
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10018
+; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: v_and_b32_e32 v39, 0xffff, v18
+; GFX12-NEXT: v_and_b32_e32 v18, 0xffff, v3
+; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v25, 0xffff, v25
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT: v_and_b32_e32 v24, 1, v4
-; GFX12-NEXT: v_and_b32_e32 v25, 1, v8
-; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
-; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
-; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2
-; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2
-; GFX12-NEXT: v_and_b32_e32 v26, 1, v15
-; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9
-; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
-; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
-; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
-; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
-; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
-; GFX12-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX12-NEXT: v_and_b32_e32 v8, 1, v12
-; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v29
-; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v35, 1, v18
-; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16
-; GFX12-NEXT: v_and_b32_e32 v39, 1, v7
-; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
-; GFX12-NEXT: v_mov_b32_e32 v5, v1
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9
-; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22
-; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20
-; GFX12-NEXT: v_mov_b32_e32 v20, v1
-; GFX12-NEXT: v_mov_b32_e32 v22, v1
-; GFX12-NEXT: v_mov_b32_e32 v18, v1
-; GFX12-NEXT: v_and_b32_e32 v12, 1, v10
-; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25
-; GFX12-NEXT: v_mov_b32_e32 v24, v1
-; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64
-; GFX12-NEXT: v_mov_b32_e32 v15, v1
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, v15
+; GFX12-NEXT: v_lshrrev_b16 v27, 1, v6
+; GFX12-NEXT: v_lshrrev_b16 v22, 3, v22
+; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v10
+; GFX12-NEXT: v_lshrrev_b16 v19, 1, v19
+; GFX12-NEXT: v_and_b32_e32 v31, 1, v13
+; GFX12-NEXT: v_mov_b32_e32 v13, v1
+; GFX12-NEXT: v_lshrrev_b16 v6, 1, v24
+; GFX12-NEXT: v_and_b32_e32 v24, 1, v26
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: v_dual_mov_b32 v2, v25 :: v_dual_and_b32 v33, 0xffff, v11
; GFX12-NEXT: v_mov_b32_e32 v11, v1
+; GFX12-NEXT: v_lshrrev_b16 v26, 1, v28
+; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v14
+; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v10, 0xffff, v22
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v27
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v26
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v19
+; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v19, 0xffff, v15
+; GFX12-NEXT: v_and_b32_e32 v27, 1, v7
+; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v29, 0xffff, v23
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1
-; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28
-; GFX12-NEXT: v_mov_b32_e32 v28, v1
-; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208
+; GFX12-NEXT: v_mov_b32_e32 v2, v19
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:240
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_and_b32 v27, 1, v17
+; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v37, 0xffff, v21
+; GFX12-NEXT: v_mov_b32_e32 v21, v1
+; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v9
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: v_mov_b32_e32 v2, v36
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v15, v1
+; GFX12-NEXT: v_mov_b32_e32 v17, v1
+; GFX12-NEXT: v_mov_b32_e32 v19, v1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, v39 :: v_dual_and_b32 v35, 1, v5
+; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v9, v1
+; GFX12-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX12-NEXT: v_mov_b32_e32 v5, v1
+; GFX12-NEXT: v_mov_b32_e32 v7, v1
+; GFX12-NEXT: v_mov_b32_e32 v27, v1
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:160
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7584,562 +7797,563 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003
-; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10005
-; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10007
-; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10009
-; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s15, s2, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10011
-; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10013
-; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10015
-; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10017
-; GFX6-NEXT: s_bfe_u32 s27, s2, 0x10019
-; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d
-; GFX6-NEXT: s_lshr_b32 s34, s2, 31
-; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003
-; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005
-; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007
-; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009
-; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011
-; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013
-; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10015
-; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017
-; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019
-; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d
-; GFX6-NEXT: s_lshr_b32 s50, s3, 31
-; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001
-; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001
-; GFX6-NEXT: s_and_b32 s7, s2, 1
-; GFX6-NEXT: s_and_b32 s10, s3, 1
-; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10002
-; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10004
-; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10006
-; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10008
-; GFX6-NEXT: s_bfe_u32 s20, s2, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s22, s2, 0x1000c
-; GFX6-NEXT: s_bfe_u32 s24, s2, 0x1000e
-; GFX6-NEXT: s_bfe_u32 s26, s2, 0x10010
-; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012
-; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014
-; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016
-; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018
-; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c
-; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002
-; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10004
-; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10006
-; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008
-; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c
-; GFX6-NEXT: s_bfe_u32 s60, s3, 0x1000e
-; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010
-; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10012
-; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014
-; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10016
-; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018
-; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1001c
+; GFX6-NEXT: s_lshr_b32 s22, s2, 16
+; GFX6-NEXT: s_lshr_b32 s23, s3, 16
+; GFX6-NEXT: s_bfe_u32 s11, s2, 0x40004
+; GFX6-NEXT: s_bfe_u32 s15, s2, 0x80008
+; GFX6-NEXT: s_bfe_u32 s17, s2, 0x4000c
+; GFX6-NEXT: s_lshr_b32 s26, s2, 24
+; GFX6-NEXT: s_lshr_b32 s27, s2, 28
+; GFX6-NEXT: s_bfe_u32 s28, s3, 0x40004
+; GFX6-NEXT: s_bfe_u32 s29, s3, 0x80008
+; GFX6-NEXT: s_bfe_u32 s30, s3, 0x4000c
+; GFX6-NEXT: s_lshr_b32 s31, s3, 24
+; GFX6-NEXT: s_lshr_b32 s33, s3, 28
+; GFX6-NEXT: s_bfe_u32 s34, s2, 0x10003
+; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10007
+; GFX6-NEXT: s_bfe_u32 s36, s2, 0x1000f
+; GFX6-NEXT: s_lshr_b32 s37, s2, 31
+; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10003
+; GFX6-NEXT: s_bfe_u32 s20, s3, 0x10007
+; GFX6-NEXT: s_bfe_u32 s13, s3, 0x1000f
+; GFX6-NEXT: s_lshr_b32 s19, s3, 31
+; GFX6-NEXT: s_and_b32 s39, s3, 0xff00
+; GFX6-NEXT: s_and_b32 s40, s3, 0xf0
+; GFX6-NEXT: s_and_b32 s41, s2, 0xff00
+; GFX6-NEXT: s_and_b32 s42, s2, 0xf0
+; GFX6-NEXT: s_bfe_u32 s6, s3, 0x10001
+; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX6-NEXT: s_and_b32 s5, s2, 1
+; GFX6-NEXT: s_and_b32 s7, s3, 1
+; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000a
+; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10010
+; GFX6-NEXT: s_bfe_u32 s10, s2, 0x10012
+; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10016
+; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10018
+; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s18, s2, 0x1001c
+; GFX6-NEXT: s_bfe_u32 s21, s3, 0x1000a
+; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10010
+; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10012
+; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10016
+; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10018
+; GFX6-NEXT: s_bfe_u32 s46, s3, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1001c
+; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10002
+; GFX6-NEXT: s_bfe_u32 s49, s2, 0x10006
+; GFX6-NEXT: s_bfe_u32 s50, s2, 0x1000e
+; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s24, s3, 0x10002
+; GFX6-NEXT: s_bfe_u32 s52, s3, 0x10006
+; GFX6-NEXT: s_bfe_u32 s2, s3, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s53, s3, 0x1000e
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v0, s67
-; GFX6-NEXT: v_mov_b32_e32 v2, s50
+; GFX6-NEXT: v_mov_b32_e32 v2, s19
+; GFX6-NEXT: s_bfe_u32 s54, s22, 0x40004
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s68
-; GFX6-NEXT: v_mov_b32_e32 v2, s49
+; GFX6-NEXT: v_mov_b32_e32 v0, s53
+; GFX6-NEXT: s_bfe_u32 s53, s23, 0x40004
+; GFX6-NEXT: s_bfe_u32 s11, s11, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: s_bfe_u32 s13, s15, 0x10001
+; GFX6-NEXT: s_bfe_u32 s15, s15, 0x10003
+; GFX6-NEXT: s_bfe_u32 s17, s17, 0x10001
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s52
+; GFX6-NEXT: s_bfe_u32 s19, s22, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NEXT: s_bfe_u32 s20, s22, 0x10003
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s24
+; GFX6-NEXT: s_bfe_u32 s24, s22, 0x10007
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: s_bfe_u32 s38, s26, 0x10001
+; GFX6-NEXT: s_bfe_u32 s52, s26, 0x10003
+; GFX6-NEXT: s_bfe_u32 s27, s27, 0x10001
+; GFX6-NEXT: s_bfe_u32 s28, s28, 0x10001
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s51
+; GFX6-NEXT: s_bfe_u32 s51, s29, 0x10001
+; GFX6-NEXT: s_bfe_u32 s29, s29, 0x10003
+; GFX6-NEXT: s_bfe_u32 s30, s30, 0x10001
+; GFX6-NEXT: v_mov_b32_e32 v2, s37
+; GFX6-NEXT: s_bfe_u32 s37, s23, 0x10001
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s50
+; GFX6-NEXT: s_bfe_u32 s50, s23, 0x10003
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: s_bfe_u32 s36, s23, 0x10007
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s49
+; GFX6-NEXT: s_bfe_u32 s49, s31, 0x10001
+; GFX6-NEXT: s_bfe_u32 s31, s31, 0x10003
+; GFX6-NEXT: s_bfe_u32 s33, s33, 0x10001
+; GFX6-NEXT: s_and_b32 s55, s23, 0xf0
+; GFX6-NEXT: s_and_b32 s23, s22, 0xf0
+; GFX6-NEXT: s_bfe_u32 s26, s42, 0x10004
+; GFX6-NEXT: v_mov_b32_e32 v2, s35
+; GFX6-NEXT: s_bfe_u32 s35, s41, 0x10008
+; GFX6-NEXT: s_bfe_u32 s41, s41, 0x1000c
+; GFX6-NEXT: s_bfe_u32 s40, s40, 0x10004
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s48
+; GFX6-NEXT: s_bfe_u32 s42, s39, 0x10008
+; GFX6-NEXT: s_bfe_u32 s39, s39, 0x1000c
+; GFX6-NEXT: s_bfe_u32 s22, s54, 0x10001
+; GFX6-NEXT: s_bfe_u32 s48, s53, 0x10001
+; GFX6-NEXT: s_bfe_u32 s23, s23, 0x10004
+; GFX6-NEXT: s_bfe_u32 s53, s55, 0x10004
+; GFX6-NEXT: v_mov_b32_e32 v2, s34
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s47
+; GFX6-NEXT: v_mov_b32_e32 v2, s33
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s66
-; GFX6-NEXT: v_mov_b32_e32 v2, s48
+; GFX6-NEXT: v_mov_b32_e32 v0, s46
+; GFX6-NEXT: v_mov_b32_e32 v2, s31
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s65
-; GFX6-NEXT: v_mov_b32_e32 v2, s47
+; GFX6-NEXT: v_mov_b32_e32 v0, s45
+; GFX6-NEXT: v_mov_b32_e32 v2, s49
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s64
-; GFX6-NEXT: v_mov_b32_e32 v2, s46
+; GFX6-NEXT: v_mov_b32_e32 v0, s44
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s63
-; GFX6-NEXT: v_mov_b32_e32 v2, s45
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s62
-; GFX6-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NEXT: v_mov_b32_e32 v0, s43
+; GFX6-NEXT: v_mov_b32_e32 v2, s50
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s61
-; GFX6-NEXT: v_mov_b32_e32 v2, s43
+; GFX6-NEXT: v_mov_b32_e32 v0, s25
+; GFX6-NEXT: v_mov_b32_e32 v2, s37
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s60
-; GFX6-NEXT: v_mov_b32_e32 v2, s42
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s59
-; GFX6-NEXT: v_mov_b32_e32 v2, s41
+; GFX6-NEXT: v_mov_b32_e32 v0, s39
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s58
-; GFX6-NEXT: v_mov_b32_e32 v2, s40
+; GFX6-NEXT: v_mov_b32_e32 v0, s21
+; GFX6-NEXT: v_mov_b32_e32 v2, s29
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s57
-; GFX6-NEXT: v_mov_b32_e32 v2, s39
+; GFX6-NEXT: v_mov_b32_e32 v0, s42
+; GFX6-NEXT: v_mov_b32_e32 v2, s51
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s56
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s55
-; GFX6-NEXT: v_mov_b32_e32 v2, s37
+; GFX6-NEXT: v_mov_b32_e32 v0, s40
+; GFX6-NEXT: v_mov_b32_e32 v2, s28
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s54
-; GFX6-NEXT: v_mov_b32_e32 v2, s36
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s53
-; GFX6-NEXT: v_mov_b32_e32 v2, s34
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s52
-; GFX6-NEXT: v_mov_b32_e32 v2, s31
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: v_mov_b32_e32 v2, s27
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s51
-; GFX6-NEXT: v_mov_b32_e32 v2, s29
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v2, s52
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s35
-; GFX6-NEXT: v_mov_b32_e32 v2, s27
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s33
-; GFX6-NEXT: v_mov_b32_e32 v2, s25
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v2, s24
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s30
-; GFX6-NEXT: v_mov_b32_e32 v2, s23
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s28
-; GFX6-NEXT: v_mov_b32_e32 v2, s21
+; GFX6-NEXT: v_mov_b32_e32 v0, s10
+; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s26
+; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s19
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s24
+; GFX6-NEXT: v_mov_b32_e32 v0, s41
; GFX6-NEXT: v_mov_b32_e32 v2, s17
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s22
-; GFX6-NEXT: v_mov_b32_e32 v2, s15
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s20
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v2, s15
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: v_mov_b32_e32 v2, s11
+; GFX6-NEXT: v_mov_b32_e32 v0, s35
+; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s26
+; GFX6-NEXT: v_mov_b32_e32 v2, s11
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
+; GFX6-NEXT: v_mov_b32_e32 v0, s53
+; GFX6-NEXT: v_mov_b32_e32 v2, s48
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s7
; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s23
+; GFX6-NEXT: v_mov_b32_e32 v2, s22
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, v1
+; GFX8-NEXT: v_mov_b32_e32 v23, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xf0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
-; GFX8-NEXT: v_and_b32_e32 v18, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s2
-; GFX8-NEXT: v_and_b32_e32 v16, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2
-; GFX8-NEXT: v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
-; GFX8-NEXT: v_and_b32_e32 v13, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
-; GFX8-NEXT: s_lshr_b32 s33, s3, 24
-; GFX8-NEXT: s_lshr_b32 s24, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s22, s3, 1
-; GFX8-NEXT: s_and_b32 s23, s2, 1
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2
-; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s35, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s36, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s37, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s38, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s39, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s40, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s41, s3, 0x10014
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1a0
+; GFX8-NEXT: s_lshr_b32 s18, s3, 24
+; GFX8-NEXT: s_lshr_b32 s17, s2, 24
+; GFX8-NEXT: s_lshr_b32 s16, s3, 16
+; GFX8-NEXT: s_lshr_b32 s28, s2, 16
+; GFX8-NEXT: s_bfe_u32 s24, s2, 0x10010
+; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10012
+; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10016
+; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10018
+; GFX8-NEXT: s_bfe_u32 s29, s3, 0x10010
+; GFX8-NEXT: s_bfe_u32 s30, s3, 0x10012
+; GFX8-NEXT: s_bfe_u32 s31, s3, 0x10016
+; GFX8-NEXT: s_bfe_u32 s33, s3, 0x10018
+; GFX8-NEXT: s_and_b32 s34, s3, 1
+; GFX8-NEXT: s_and_b32 s35, s2, 1
+; GFX8-NEXT: s_add_u32 s4, s0, 0x70
; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1b0
+; GFX8-NEXT: s_add_u32 s6, s0, 0x170
+; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v25, s7
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 14, s3
+; GFX8-NEXT: v_mov_b32_e32 v24, s6
+; GFX8-NEXT: s_add_u32 s6, s0, 48
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v18
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s3
+; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX8-NEXT: v_mov_b32_e32 v24, s7
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 6, s2
+; GFX8-NEXT: v_and_b32_e32 v18, s2, v3
+; GFX8-NEXT: v_mov_b32_e32 v23, s6
+; GFX8-NEXT: s_add_u32 s6, s0, 16
+; GFX8-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX8-NEXT: v_lshrrev_b16_e32 v21, 7, v18
+; GFX8-NEXT: v_mov_b32_e32 v20, v1
+; GFX8-NEXT: v_mov_b32_e32 v22, v1
+; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
+; GFX8-NEXT: v_mov_b32_e32 v25, s7
+; GFX8-NEXT: v_and_b32_e64 v7, s2, 8
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 2, s2
+; GFX8-NEXT: v_mov_b32_e32 v24, s6
+; GFX8-NEXT: s_add_u32 s6, s0, 0x1f0
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v17
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 3, v7
+; GFX8-NEXT: v_mov_b32_e32 v21, v1
+; GFX8-NEXT: v_mov_b32_e32 v23, v1
+; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX8-NEXT: v_mov_b32_e32 v25, s7
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 6, s18
+; GFX8-NEXT: v_mov_b32_e32 v24, s6
+; GFX8-NEXT: s_add_u32 s6, s0, 0x130
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 6, s3
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v9
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s18
+; GFX8-NEXT: v_mov_b32_e32 v23, v1
+; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX8-NEXT: v_and_b32_e32 v17, s3, v3
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v8
+; GFX8-NEXT: v_mov_b32_e32 v9, s7
+; GFX8-NEXT: v_mov_b32_e32 v8, s6
+; GFX8-NEXT: s_add_u32 s6, s0, 0x110
+; GFX8-NEXT: v_and_b32_e64 v15, s3, 8
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 2, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 7, v17
+; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s17
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v16
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 3, v15
+; GFX8-NEXT: v_mov_b32_e32 v16, s7
+; GFX8-NEXT: v_mov_b32_e32 v15, s6
+; GFX8-NEXT: s_add_u32 s6, s0, 0x100
; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x190
+; GFX8-NEXT: s_add_u32 s8, s0, 0xf0
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[20:23]
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 12, s2
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v14
+; GFX8-NEXT: v_mov_b32_e32 v15, s9
+; GFX8-NEXT: v_mov_b32_e32 v14, s8
+; GFX8-NEXT: s_add_u32 s8, s0, 0x60
+; GFX8-NEXT: v_and_b32_e32 v13, 2, v12
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s17
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[20:23]
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s2
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v12
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 1, v13
+; GFX8-NEXT: v_mov_b32_e32 v13, s9
+; GFX8-NEXT: v_mov_b32_e32 v12, s8
+; GFX8-NEXT: s_add_u32 s8, s0, 0x50
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GFX8-NEXT: v_and_b32_e32 v10, 8, v4
+; GFX8-NEXT: v_mov_b32_e32 v21, s9
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 10, s2
+; GFX8-NEXT: v_mov_b32_e32 v20, s8
+; GFX8-NEXT: s_add_u32 s8, s0, 64
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 3, v10
+; GFX8-NEXT: v_mov_b32_e32 v12, v1
+; GFX8-NEXT: v_mov_b32_e32 v14, v1
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[11:14]
+; GFX8-NEXT: v_mov_b32_e32 v21, s9
+; GFX8-NEXT: v_and_b32_e32 v6, 2, v4
+; GFX8-NEXT: v_mov_b32_e32 v20, s8
+; GFX8-NEXT: s_add_u32 s8, s0, 0x1d0
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v12, 1, v6
+; GFX8-NEXT: v_mov_b32_e32 v11, v1
+; GFX8-NEXT: v_mov_b32_e32 v13, v1
+; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[10:13]
+; GFX8-NEXT: v_and_b32_e64 v19, s18, 8
+; GFX8-NEXT: v_mov_b32_e32 v13, s9
+; GFX8-NEXT: v_mov_b32_e32 v12, s8
+; GFX8-NEXT: s_add_u32 s8, s0, 0x1c0
; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s10, s0, 0x180
+; GFX8-NEXT: s_add_u32 s10, s0, 0x1b0
; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s12, s0, 0x190
; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: s_add_u32 s14, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s14, s0, 0x180
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s18
; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 12, s3
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v26
+; GFX8-NEXT: v_lshrrev_b16_e32 v22, 3, v19
+; GFX8-NEXT: v_mov_b32_e32 v21, v1
+; GFX8-NEXT: s_add_u32 s20, s0, 0x160
+; GFX8-NEXT: v_and_b32_e32 v24, 2, v16
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GFX8-NEXT: s_addc_u32 s21, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s20
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 4, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v19, v1
+; GFX8-NEXT: v_mov_b32_e32 v14, s21
+; GFX8-NEXT: v_and_b32_e64 v7, s18, 2
+; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[18:21]
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s18
+; GFX8-NEXT: s_add_u32 s18, s0, 0x150
+; GFX8-NEXT: s_addc_u32 s19, s1, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 8, s3
+; GFX8-NEXT: v_mov_b32_e32 v23, s19
+; GFX8-NEXT: v_and_b32_e32 v25, 8, v15
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 10, s3
+; GFX8-NEXT: v_mov_b32_e32 v22, s18
+; GFX8-NEXT: s_add_u32 s18, s0, 0x140
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v27
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 3, v25
+; GFX8-NEXT: s_addc_u32 s19, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT: v_mov_b32_e32 v23, s19
+; GFX8-NEXT: v_mov_b32_e32 v22, s18
+; GFX8-NEXT: s_add_u32 s18, s0, 0xd0
+; GFX8-NEXT: v_and_b32_e32 v28, 2, v15
+; GFX8-NEXT: s_addc_u32 s19, s1, 0
+; GFX8-NEXT: v_and_b32_e64 v9, s16, 8
+; GFX8-NEXT: v_and_b32_e64 v8, s16, 2
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v15
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 1, v28
+; GFX8-NEXT: v_and_b32_e32 v26, s16, v3
+; GFX8-NEXT: s_add_u32 s16, s0, 0xc0
+; GFX8-NEXT: v_and_b32_e64 v29, s17, 8
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s17
+; GFX8-NEXT: v_and_b32_e64 v11, s17, 2
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s17
+; GFX8-NEXT: v_mov_b32_e32 v23, s19
; GFX8-NEXT: s_addc_u32 s17, s1, 0
-; GFX8-NEXT: s_add_u32 s18, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v22, s18
+; GFX8-NEXT: s_add_u32 s18, s0, 0xb0
; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x70
-; GFX8-NEXT: v_and_b32_e32 v7, 1, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s42
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v24, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x170
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3
-; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s3
-; GFX8-NEXT: v_mov_b32_e32 v23, v1
-; GFX8-NEXT: v_mov_b32_e32 v25, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s33
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 7, s33
-; GFX8-NEXT: v_mov_b32_e32 v22, v1
-; GFX8-NEXT: v_mov_b32_e32 v24, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0xf0
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 7, s24
-; GFX8-NEXT: v_mov_b32_e32 v23, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x60
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v19
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX8-NEXT: v_mov_b32_e32 v18, s42
-; GFX8-NEXT: v_mov_b32_e32 v19, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x50
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v17
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v16
-; GFX8-NEXT: v_mov_b32_e32 v16, s42
-; GFX8-NEXT: v_mov_b32_e32 v17, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 64
-; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[22:25]
-; GFX8-NEXT: v_mov_b32_e32 v17, 1
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v26, s42
-; GFX8-NEXT: v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v15
-; GFX8-NEXT: v_mov_b32_e32 v27, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 48
-; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX8-NEXT: v_mov_b32_e32 v13, s42
-; GFX8-NEXT: v_mov_b32_e32 v14, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 32
-; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v11
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v10
-; GFX8-NEXT: v_mov_b32_e32 v10, s42
-; GFX8-NEXT: v_mov_b32_e32 v11, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 16
-; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v9
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v8
-; GFX8-NEXT: v_mov_b32_e32 v8, s42
-; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x160
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s33
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s33
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v22, s42
-; GFX8-NEXT: v_and_b32_e32 v28, 1, v10
-; GFX8-NEXT: v_and_b32_e32 v19, 1, v8
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v5
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v7
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v11, v1
-; GFX8-NEXT: v_mov_b32_e32 v23, s43
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x150
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
-; GFX8-NEXT: v_and_b32_e32 v22, 1, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_and_b32_e32 v7, 1, v21
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s43
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[7:10]
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v4
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x140
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NEXT: v_and_b32_e32 v20, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v8, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3
-; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX8-NEXT: v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x130
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v7, 1, v18
-; GFX8-NEXT: v_mov_b32_e32 v17, s42
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v10
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: v_mov_b32_e32 v18, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x120
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s3
-; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10]
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v19
-; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mov_b32_e32 v19, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x110
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s3
-; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s42
+; GFX8-NEXT: s_add_u32 s20, s0, 0x90
+; GFX8-NEXT: s_addc_u32 s21, s1, 0
+; GFX8-NEXT: s_add_u32 s22, s0, 0x80
+; GFX8-NEXT: s_addc_u32 s23, s1, 0
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 3, v29
+; GFX8-NEXT: s_add_u32 s36, s0, 32
+; GFX8-NEXT: v_and_b32_e32 v24, 2, v6
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT: s_addc_u32 s37, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v22, s36
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v23, s37
+; GFX8-NEXT: v_and_b32_e32 v28, s28, v3
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 14, s2
+; GFX8-NEXT: v_and_b32_e64 v2, s2, 2
+; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v28
+; GFX8-NEXT: v_mov_b32_e32 v19, s5
+; GFX8-NEXT: v_and_b32_e32 v6, 2, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v21, 1, v2
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v17, 1, v15
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v12
-; GFX8-NEXT: v_mov_b32_e32 v18, v1
-; GFX8-NEXT: v_mov_b32_e32 v20, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v13
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
-; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s41
-; GFX8-NEXT: v_mov_b32_e32 v2, s40
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 15, s2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s38
-; GFX8-NEXT: v_mov_b32_e32 v2, s39
-; GFX8-NEXT: v_mov_b32_e32 v12, s6
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s9
-; GFX8-NEXT: v_mov_b32_e32 v0, s36
-; GFX8-NEXT: v_mov_b32_e32 v2, s37
-; GFX8-NEXT: v_mov_b32_e32 v12, s8
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s11
+; GFX8-NEXT: v_mov_b32_e32 v18, s4
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_and_b32_e64 v5, s3, 2
; GFX8-NEXT: v_mov_b32_e32 v0, s35
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v12, s10
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v12, s12
+; GFX8-NEXT: v_mov_b32_e32 v2, v21
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: v_mov_b32_e32 v22, s7
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v21, s6
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_and_b32_e32 v25, 2, v15
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7
+; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v22, s9
+; GFX8-NEXT: v_mov_b32_e32 v0, s33
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_mov_b32_e32 v21, s8
+; GFX8-NEXT: v_lshrrev_b16_e32 v23, 1, v25
+; GFX8-NEXT: v_mov_b32_e32 v25, s11
+; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v24, s10
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 7, v26
; GFX8-NEXT: v_mov_b32_e32 v0, s31
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v13, s13
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v12, s14
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 4, v17
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 3, v9
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v25, s13
+; GFX8-NEXT: v_and_b32_e32 v17, 2, v16
+; GFX8-NEXT: v_and_b32_e32 v21, 1, v15
+; GFX8-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: v_mov_b32_e32 v24, s12
+; GFX8-NEXT: v_and_b32_e32 v7, 1, v16
+; GFX8-NEXT: v_mov_b32_e32 v16, s15
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v15, s14
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v8
; GFX8-NEXT: v_mov_b32_e32 v0, s29
-; GFX8-NEXT: v_mov_b32_e32 v2, s30
-; GFX8-NEXT: v_mov_b32_e32 v13, s15
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v12, s16
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v11
+; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v15, s16
+; GFX8-NEXT: v_and_b32_e32 v14, 2, v13
; GFX8-NEXT: v_mov_b32_e32 v0, s27
-; GFX8-NEXT: v_mov_b32_e32 v2, s28
-; GFX8-NEXT: v_mov_b32_e32 v13, s17
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v12, s18
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: v_mov_b32_e32 v16, s17
+; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3]
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v13
+; GFX8-NEXT: v_lshrrev_b16_e32 v13, 1, v14
+; GFX8-NEXT: v_mov_b32_e32 v14, s18
+; GFX8-NEXT: v_and_b32_e64 v10, s28, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 7, v28
; GFX8-NEXT: v_mov_b32_e32 v0, s26
-; GFX8-NEXT: v_mov_b32_e32 v2, s25
-; GFX8-NEXT: v_mov_b32_e32 v13, s19
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: s_add_u32 s2, s0, 0x100
-; GFX8-NEXT: v_mov_b32_e32 v0, s23
-; GFX8-NEXT: v_mov_b32_e32 v2, v10
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
-; GFX8-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NEXT: v_mov_b32_e32 v15, s19
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 3, v10
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v14, s20
+; GFX8-NEXT: v_and_b32_e64 v12, s28, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s25
; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: v_mov_b32_e32 v15, s21
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v12
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v14, s22
+; GFX8-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: v_mov_b32_e32 v15, s23
; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0
-; GFX8-NEXT: v_and_b32_e32 v26, 1, v14
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 4, s33
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_and_b32_e32 v17, 1, v27
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v26
+; GFX8-NEXT: v_mov_b32_e32 v12, v1
+; GFX8-NEXT: v_mov_b32_e32 v14, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
+; GFX8-NEXT: s_add_u32 s2, s0, 0x120
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[11:14]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v17
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s33
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v28
-; GFX8-NEXT: v_mov_b32_e32 v15, v1
-; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17]
-; GFX8-NEXT: v_mov_b32_e32 v0, s21
-; GFX8-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[7:10]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s24
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT: v_and_b32_e32 v7, 1, v23
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v22
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: s_add_u32 s2, s0, 0x1a0
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[7:10]
+; GFX8-NEXT: v_lshrrev_b16_e32 v27, 4, v26
+; GFX8-NEXT: v_mov_b32_e32 v22, v1
+; GFX8-NEXT: v_mov_b32_e32 v24, v1
+; GFX8-NEXT: s_add_u32 s0, s0, 0xa0
+; GFX8-NEXT: v_and_b32_e32 v20, 2, v27
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
+; GFX8-NEXT: v_mov_b32_e32 v19, v1
+; GFX8-NEXT: v_mov_b32_e32 v21, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, v11
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v27
+; GFX8-NEXT: v_lshrrev_b16_e32 v20, 1, v20
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v64i1_to_v64i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @40, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @38
-; EG-NEXT: ALU 95, @41, KC0[], KC1[]
-; EG-NEXT: ALU 99, @137, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 60, @237, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @42, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @40
+; EG-NEXT: ALU 98, @43, KC0[], KC1[]
+; EG-NEXT: ALU 105, @142, KC0[], KC1[]
+; EG-NEXT: ALU 90, @248, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @339, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T82.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T81.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T80.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T79.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T78.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T77.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T79.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T78.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T77.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T76.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T75.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T74.X, 0
@@ -8165,149 +8379,201 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T54.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T53.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T52.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T51.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T51.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 38:
-; EG-NEXT: VTX_READ_64 T25.XY, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 40:
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 40:
+; EG-NEXT: VTX_READ_64 T22.XY, T19.X, 0, #1
+; EG-NEXT: ALU clause starting at 42:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 41:
-; EG-NEXT: LSHR * T19.Z, T25.Y, literal.x,
+; EG-NEXT: ALU clause starting at 43:
+; EG-NEXT: LSHR * T19.Z, T22.Y, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T19.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T19.Y, 0.0,
-; EG-NEXT: BFE_UINT * T20.Z, T25.Y, literal.y, 1,
+; EG-NEXT: BFE_UINT * T20.Z, T22.Y, literal.y, 1,
; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44)
-; EG-NEXT: BFE_UINT T20.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T20.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T20.Y, 0.0,
-; EG-NEXT: BFE_UINT * T21.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44)
-; EG-NEXT: BFE_UINT T21.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T21.Z, T22.Y, literal.y, 1,
+; EG-NEXT: 28(3.923636e-44), 25(3.503246e-44)
+; EG-NEXT: BFE_UINT T21.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T21.Y, 0.0,
-; EG-NEXT: BFE_UINT * T22.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44)
-; EG-NEXT: BFE_UINT T22.X, T25.Y, literal.x, 1,
-; EG-NEXT: MOV T22.Y, 0.0,
-; EG-NEXT: BFE_UINT * T23.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44)
-; EG-NEXT: BFE_UINT T23.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T23.Z, T22.Y, literal.y, 1,
+; EG-NEXT: 24(3.363116e-44), 21(2.942727e-44)
+; EG-NEXT: BFE_UINT T23.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T23.Y, 0.0,
-; EG-NEXT: BFE_UINT * T24.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44)
-; EG-NEXT: BFE_UINT T24.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T24.Z, T22.Y, literal.y, 1,
+; EG-NEXT: 20(2.802597e-44), 17(2.382207e-44)
+; EG-NEXT: BFE_UINT T24.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T24.Y, 0.0,
-; EG-NEXT: BFE_UINT * T26.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44)
-; EG-NEXT: BFE_UINT T26.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T25.Z, T22.Y, literal.y, 1,
+; EG-NEXT: 16(2.242078e-44), 13(1.821688e-44)
+; EG-NEXT: BFE_UINT T25.X, T22.Y, literal.x, 1,
+; EG-NEXT: MOV T25.Y, 0.0,
+; EG-NEXT: BFE_UINT * T26.Z, T22.Y, literal.y, 1,
+; EG-NEXT: 12(1.681558e-44), 9(1.261169e-44)
+; EG-NEXT: BFE_UINT T26.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T26.Y, 0.0,
-; EG-NEXT: BFE_UINT * T27.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44)
-; EG-NEXT: BFE_UINT T27.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T27.Z, T22.Y, literal.y, 1,
+; EG-NEXT: 8(1.121039e-44), 5(7.006492e-45)
+; EG-NEXT: BFE_UINT T27.X, T22.Y, literal.x, 1,
; EG-NEXT: MOV T27.Y, 0.0,
-; EG-NEXT: BFE_UINT * T28.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 16(2.242078e-44), 15(2.101948e-44)
-; EG-NEXT: BFE_UINT T28.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT T28.Z, T22.Y, 1, 1,
+; EG-NEXT: AND_INT * T28.X, T22.Y, 1,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: MOV T28.Y, 0.0,
-; EG-NEXT: BFE_UINT * T29.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44)
-; EG-NEXT: BFE_UINT T29.X, T25.Y, literal.x, 1,
+; EG-NEXT: LSHR * T29.Z, T22.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T29.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T29.Y, 0.0,
-; EG-NEXT: BFE_UINT * T30.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44)
-; EG-NEXT: BFE_UINT T30.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T30.Z, T22.X, literal.y, 1,
+; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44)
+; EG-NEXT: BFE_UINT T30.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T30.Y, 0.0,
-; EG-NEXT: BFE_UINT * T31.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44)
-; EG-NEXT: BFE_UINT T31.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T31.Z, T22.X, literal.y, 1,
+; EG-NEXT: 28(3.923636e-44), 25(3.503246e-44)
+; EG-NEXT: BFE_UINT T31.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T31.Y, 0.0,
-; EG-NEXT: BFE_UINT * T32.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45)
-; EG-NEXT: BFE_UINT T32.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T32.Z, T22.X, literal.y, 1,
+; EG-NEXT: 24(3.363116e-44), 21(2.942727e-44)
+; EG-NEXT: BFE_UINT T32.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T32.Y, 0.0,
-; EG-NEXT: BFE_UINT * T33.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45)
-; EG-NEXT: BFE_UINT T33.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T33.Z, T22.X, literal.y, 1,
+; EG-NEXT: 20(2.802597e-44), 17(2.382207e-44)
+; EG-NEXT: BFE_UINT T33.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T33.Y, 0.0,
-; EG-NEXT: BFE_UINT * T34.Z, T25.Y, literal.y, 1,
-; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
-; EG-NEXT: BFE_UINT T34.X, T25.Y, literal.x, 1,
+; EG-NEXT: BFE_UINT * T34.Z, T22.X, literal.y, 1,
+; EG-NEXT: 16(2.242078e-44), 13(1.821688e-44)
+; EG-NEXT: BFE_UINT T34.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T34.Y, 0.0,
-; EG-NEXT: BFE_UINT T35.Z, T25.Y, 1, 1,
-; EG-NEXT: AND_INT * T35.X, T25.Y, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT * T35.Z, T22.X, literal.y, 1,
+; EG-NEXT: 12(1.681558e-44), 9(1.261169e-44)
+; EG-NEXT: BFE_UINT T35.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T35.Y, 0.0,
-; EG-NEXT: LSHR * T36.Z, T25.X, literal.x,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T36.X, T25.X, literal.x, 1,
+; EG-NEXT: BFE_UINT * T36.Z, T22.X, literal.y, 1,
+; EG-NEXT: 8(1.121039e-44), 5(7.006492e-45)
+; EG-NEXT: BFE_UINT T36.X, T22.X, literal.x, 1,
; EG-NEXT: MOV T36.Y, 0.0,
-; EG-NEXT: BFE_UINT * T37.Z, T25.X, literal.y, 1,
-; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44)
-; EG-NEXT: BFE_UINT T37.X, T25.X, literal.x, 1,
-; EG-NEXT: MOV T37.Y, 0.0,
-; EG-NEXT: BFE_UINT * T38.Z, T25.X, literal.y, 1,
-; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44)
-; EG-NEXT: BFE_UINT T38.X, T25.X, literal.x, 1,
-; EG-NEXT: MOV T38.Y, 0.0,
-; EG-NEXT: BFE_UINT * T39.Z, T25.X, literal.y, 1,
-; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44)
-; EG-NEXT: BFE_UINT T39.X, T25.X, literal.x, 1,
+; EG-NEXT: BFE_UINT T37.Z, T22.X, 1, 1,
+; EG-NEXT: AND_INT * T37.X, T22.X, 1,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T37.Y, 0.0,
+; EG-NEXT: BFE_UINT T38.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, T22.Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T38.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T38.Y, 0.0,
+; EG-NEXT: BFE_UINT T39.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, T22.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T39.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T39.Y, 0.0,
-; EG-NEXT: BFE_UINT * T40.Z, T25.X, literal.y, 1,
-; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44)
-; EG-NEXT: BFE_UINT T40.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T40.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 26(3.643376e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T40.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T40.Y, 0.0,
-; EG-NEXT: BFE_UINT * T41.Z, T25.X, literal.y, 1,
-; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44)
-; EG-NEXT: BFE_UINT T41.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.Y, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T41.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 22(3.082857e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T41.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 142:
; EG-NEXT: MOV T41.Y, 0.0,
-; EG-NEXT: BFE_UINT * T42.Z, T25.X, literal.y, 1,
-; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44)
-; EG-NEXT: BFE_UINT T42.X, T25.X, literal.x, 1,
-; EG-NEXT: MOV T42.Y, 0.0,
-; EG-NEXT: BFE_UINT * T43.Z, T25.X, literal.y, 1,
-; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44)
-; EG-NEXT: BFE_UINT * T43.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 137:
+; EG-NEXT: BFE_UINT T42.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 18(2.522337e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T42.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T42.Y, 0.0,
+; EG-NEXT: LSHR * T0.W, T22.Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T43.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 14(1.961818e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T43.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T43.Y, 0.0,
-; EG-NEXT: BFE_UINT * T44.Z, T25.X, literal.x, 1,
-; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T44.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T44.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 10(1.401298e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T44.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T44.Y, 0.0,
-; EG-NEXT: BFE_UINT * T45.Z, T25.X, literal.y, 1,
-; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44)
-; EG-NEXT: BFE_UINT T45.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T45.X, T22.Y, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T45.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T45.Y, 0.0,
-; EG-NEXT: BFE_UINT * T46.Z, T25.X, literal.y, 1,
-; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44)
-; EG-NEXT: BFE_UINT T46.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T46.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 26(3.643376e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T46.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T46.Y, 0.0,
-; EG-NEXT: BFE_UINT * T47.Z, T25.X, literal.y, 1,
-; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44)
-; EG-NEXT: BFE_UINT T47.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.X, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T47.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 22(3.082857e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T47.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T47.Y, 0.0,
-; EG-NEXT: BFE_UINT * T48.Z, T25.X, literal.y, 1,
-; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45)
-; EG-NEXT: BFE_UINT T48.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T48.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 18(2.522337e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T48.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T48.Y, 0.0,
-; EG-NEXT: BFE_UINT * T49.Z, T25.X, literal.y, 1,
-; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45)
-; EG-NEXT: BFE_UINT T49.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.X, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T49.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 14(1.961818e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T49.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T49.Y, 0.0,
-; EG-NEXT: BFE_UINT * T50.Z, T25.X, literal.y, 1,
-; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
-; EG-NEXT: BFE_UINT T50.X, T25.X, literal.x, 1,
+; EG-NEXT: LSHR * T0.W, T22.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T50.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 10(1.401298e-44), 8(1.121039e-44)
+; EG-NEXT: LSHR * T50.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: MOV T50.Y, 0.0,
-; EG-NEXT: BFE_UINT T25.Z, T25.X, 1, 1,
-; EG-NEXT: AND_INT * T25.X, T25.X, 1,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T25.Y, 0.0,
+; EG-NEXT: LSHR * T0.W, T22.X, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T22.X, T22.X, literal.x, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 6(8.407791e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR * T22.Z, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T22.Y, 0.0,
; EG-NEXT: MOV T19.W, 0.0,
; EG-NEXT: MOV * T20.W, 0.0,
; EG-NEXT: MOV T21.W, 0.0,
-; EG-NEXT: MOV * T22.W, 0.0,
-; EG-NEXT: MOV T23.W, 0.0,
-; EG-NEXT: MOV * T24.W, 0.0,
+; EG-NEXT: MOV * T23.W, 0.0,
+; EG-NEXT: MOV T24.W, 0.0,
+; EG-NEXT: MOV * T25.W, 0.0,
; EG-NEXT: MOV T26.W, 0.0,
; EG-NEXT: MOV * T27.W, 0.0,
; EG-NEXT: MOV T28.W, 0.0,
@@ -8326,103 +8592,105 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; EG-NEXT: MOV * T41.W, 0.0,
; EG-NEXT: MOV T42.W, 0.0,
; EG-NEXT: MOV * T43.W, 0.0,
-; EG-NEXT: MOV T44.W, 0.0,
-; EG-NEXT: MOV * T45.W, 0.0,
-; EG-NEXT: MOV T46.W, 0.0,
-; EG-NEXT: MOV * T47.W, 0.0,
-; EG-NEXT: MOV T48.W, 0.0,
-; EG-NEXT: MOV * T49.W, 0.0,
-; EG-NEXT: MOV T50.W, 0.0,
-; EG-NEXT: MOV * T25.W, 0.0,
-; EG-NEXT: LSHR T51.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T44.W, 0.0,
+; EG-NEXT: ALU clause starting at 248:
+; EG-NEXT: MOV T45.W, 0.0,
+; EG-NEXT: MOV * T46.W, 0.0,
+; EG-NEXT: MOV T47.W, 0.0,
+; EG-NEXT: MOV * T48.W, 0.0,
+; EG-NEXT: MOV T49.W, 0.0,
+; EG-NEXT: MOV * T50.W, 0.0,
+; EG-NEXT: MOV T22.W, 0.0,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T51.X, PS, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T52.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T53.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T54.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T55.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; EG-NEXT: LSHR T56.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT: 2(2.802597e-45), 304(4.259947e-43)
; EG-NEXT: LSHR T57.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT: 2(2.802597e-45), 336(4.708363e-43)
; EG-NEXT: LSHR T58.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: 2(2.802597e-45), 368(5.156778e-43)
; EG-NEXT: LSHR T59.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT: 2(2.802597e-45), 400(5.605194e-43)
; EG-NEXT: LSHR T60.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
+; EG-NEXT: 2(2.802597e-45), 432(6.053609e-43)
; EG-NEXT: LSHR T61.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT: LSHR * T62.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 237:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
+; EG-NEXT: 2(2.802597e-45), 464(6.502025e-43)
+; EG-NEXT: LSHR T62.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T63.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT: 2(2.802597e-45), 272(3.811532e-43)
; EG-NEXT: LSHR T64.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT: LSHR T65.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
+; EG-NEXT: LSHR * T65.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T66.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 256(3.587324e-43)
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T67.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 272(3.811532e-43)
+; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T68.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 288(4.035740e-43)
+; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; EG-NEXT: LSHR T69.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 304(4.259947e-43)
+; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; EG-NEXT: LSHR T70.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 320(4.484155e-43)
+; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; EG-NEXT: LSHR T71.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 336(4.708363e-43)
+; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
; EG-NEXT: LSHR T72.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 352(4.932571e-43)
+; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR T73.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 368(5.156778e-43)
+; EG-NEXT: 2(2.802597e-45), 256(3.587324e-43)
; EG-NEXT: LSHR T74.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 384(5.380986e-43)
+; EG-NEXT: 2(2.802597e-45), 288(4.035740e-43)
; EG-NEXT: LSHR T75.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 400(5.605194e-43)
+; EG-NEXT: 2(2.802597e-45), 320(4.484155e-43)
; EG-NEXT: LSHR T76.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 416(5.829402e-43)
+; EG-NEXT: 2(2.802597e-45), 352(4.932571e-43)
; EG-NEXT: LSHR T77.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 432(6.053609e-43)
-; EG-NEXT: LSHR T78.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 448(6.277817e-43)
+; EG-NEXT: 2(2.802597e-45), 384(5.380986e-43)
+; EG-NEXT: LSHR * T78.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 339:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 416(5.829402e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T79.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 464(6.502025e-43)
+; EG-NEXT: 2(2.802597e-45), 448(6.277817e-43)
; EG-NEXT: LSHR T80.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 480(6.726233e-43)
@@ -8438,210 +8706,224 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_and_b32_e64 v3, 0xf0, s2
+; GFX12-NEXT: v_and_b32_e64 v4, s2, 8
+; GFX12-NEXT: v_and_b32_e64 v12, s3, 2
+; GFX12-NEXT: v_lshrrev_b16 v14, 12, s2
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
-; GFX12-NEXT: v_and_b32_e32 v34, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v41, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
+; GFX12-NEXT: v_lshrrev_b16 v32, 4, v3
+; GFX12-NEXT: v_lshrrev_b16 v36, 3, v4
+; GFX12-NEXT: v_lshrrev_b16 v41, 1, v12
+; GFX12-NEXT: v_and_b32_e32 v4, 2, v14
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v3, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s3
-; GFX12-NEXT: v_and_b32_e32 v50, 1, v14
-; GFX12-NEXT: v_and_b32_e32 v47, 1, v18
-; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
-; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v0, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2
-; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: v_and_b32_e32 v42, 1, v8
-; GFX12-NEXT: v_and_b32_e32 v52, 1, v10
-; GFX12-NEXT: v_and_b32_e32 v40, 1, v23
-; GFX12-NEXT: v_dual_mov_b32 v44, v1 :: v_dual_and_b32 v43, 1, v24
-; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v24, 4, s5
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10014
-; GFX12-NEXT: v_and_b32_e32 v33, 1, v25
-; GFX12-NEXT: v_and_b32_e32 v25, 1, v6
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10015
-; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 11, s3
-; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v35, 1, v5
-; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v5, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v36, 7, s5
-; GFX12-NEXT: v_lshrrev_b16 v37, 6, s5
-; GFX12-NEXT: v_and_b32_e32 v56, 1, v8
-; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v23
-; GFX12-NEXT: v_and_b32_e32 v8, 1, v24
-; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v25
-; GFX12-NEXT: v_and_b32_e32 v23, 1, v2
-; GFX12-NEXT: v_dual_mov_b32 v24, v1 :: v_dual_and_b32 v25, 0xffff, v0
-; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v3
-; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
-; GFX12-NEXT: v_and_b32_e32 v27, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v9, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 6, s2
-; GFX12-NEXT: v_and_b32_e32 v22, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v54, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v55, 1, s4
+; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v12, 2, v32
+; GFX12-NEXT: s_lshr_b32 s7, s3, 16
+; GFX12-NEXT: v_lshrrev_b16 v28, 8, s2
+; GFX12-NEXT: v_and_b32_e64 v8, 0xf0, s3
+; GFX12-NEXT: v_and_b32_e64 v10, s3, 8
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s3
+; GFX12-NEXT: v_and_b32_e64 v46, 0xf0, s7
+; GFX12-NEXT: v_and_b32_e64 v18, s5, 8
+; GFX12-NEXT: v_lshrrev_b16 v34, 1, v12
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s5
+; GFX12-NEXT: s_lshr_b32 s6, s3, 24
+; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: v_and_b32_e64 v6, s2, 2
+; GFX12-NEXT: v_and_b32_e64 v16, s6, 8
+; GFX12-NEXT: v_and_b32_e64 v15, 0xf0, s4
+; GFX12-NEXT: v_lshrrev_b16 v13, 7, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 2, v28
+; GFX12-NEXT: v_lshrrev_b16 v40, 4, s6
+; GFX12-NEXT: v_lshrrev_b16 v19, 7, v8
+; GFX12-NEXT: v_lshrrev_b16 v39, 3, v10
+; GFX12-NEXT: v_and_b32_e32 v10, 8, v33
+; GFX12-NEXT: v_lshrrev_b16 v37, 3, v18
+; GFX12-NEXT: v_lshrrev_b16 v18, 4, v8
+; GFX12-NEXT: v_lshrrev_b16 v8, 4, v46
+; GFX12-NEXT: v_and_b32_e32 v25, 2, v12
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v0, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v38, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v6, 8, v28
+; GFX12-NEXT: v_lshrrev_b16 v49, 3, v16
+; GFX12-NEXT: v_lshrrev_b16 v16, 1, v4
+; GFX12-NEXT: v_lshrrev_b16 v31, 1, v3
+; GFX12-NEXT: v_lshrrev_b16 v3, 4, v15
+; GFX12-NEXT: v_and_b32_e32 v4, 2, v40
+; GFX12-NEXT: v_and_b32_e32 v24, 2, v8
+; GFX12-NEXT: v_lshrrev_b16 v52, 1, v25
+; GFX12-NEXT: v_mov_b32_e32 v25, v1
+; GFX12-NEXT: v_lshrrev_b16 v44, 3, v10
+; GFX12-NEXT: v_and_b32_e32 v10, 2, v18
+; GFX12-NEXT: v_lshrrev_b16 v29, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v30, 3, v6
+; GFX12-NEXT: v_and_b32_e32 v6, 2, v3
+; GFX12-NEXT: v_lshrrev_b16 v50, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v3
+; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v0, 1, v0
+; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-NEXT: v_lshrrev_b16 v51, 1, v10
+; GFX12-NEXT: v_lshrrev_b16 v10, 1, v24
+; GFX12-NEXT: v_and_b32_e32 v24, 1, v14
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v16
+; GFX12-NEXT: v_lshrrev_b16 v22, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v35, 2, s5
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:96
-; GFX12-NEXT: v_and_b32_e32 v23, 1, v37
-; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v36
-; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v28, 0xffff, v34
-; GFX12-NEXT: v_dual_mov_b32 v59, v1 :: v_dual_and_b32 v34, 1, v11
-; GFX12-NEXT: v_dual_mov_b32 v35, v1 :: v_dual_and_b32 v36, 0xffff, v35
-; GFX12-NEXT: v_dual_mov_b32 v37, v1 :: v_dual_and_b32 v26, 1, v9
-; GFX12-NEXT: v_mov_b32_e32 v27, v1
-; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v0, 1, v55
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:64
-; GFX12-NEXT: v_and_b32_e32 v34, 1, v13
-; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v41
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v54
-; GFX12-NEXT: global_store_b128 v1, v[26:29], s[0:1] offset:80
-; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, s7
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:48
-; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v2
-; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 14, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
-; GFX12-NEXT: v_mov_b32_e32 v0, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v19, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v32, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
-; GFX12-NEXT: v_mov_b32_e32 v0, s8
-; GFX12-NEXT: v_mov_b32_e32 v2, s9
-; GFX12-NEXT: v_lshrrev_b16 v39, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 10, s3
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:96
+; GFX12-NEXT: v_and_b32_e32 v24, 1, v29
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v30
+; GFX12-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v31
+; GFX12-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_and_b32 v0, 2, v33
+; GFX12-NEXT: v_lshrrev_b16 v43, 7, s6
+; GFX12-NEXT: v_lshrrev_b16 v42, 6, s6
+; GFX12-NEXT: v_lshrrev_b16 v23, 2, s6
+; GFX12-NEXT: v_and_b32_e64 v45, s6, 2
+; GFX12-NEXT: s_and_b32 s6, s2, 1
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1] offset:64
+; GFX12-NEXT: v_and_b32_e32 v28, 1, v35
+; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v37
+; GFX12-NEXT: v_and_b32_e32 v35, 1, v22
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v38
+; GFX12-NEXT: v_mov_b32_e32 v38, v1
+; GFX12-NEXT: v_lshrrev_b16 v22, 1, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011
+; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v37, 0xffff, v36
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v56, 0xffff, v5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:16
+; GFX12-NEXT: s_bfe_u32 s6, s3, 0x10018
+; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v22
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v41
+; GFX12-NEXT: v_lshrrev_b16 v21, 2, s3
+; GFX12-NEXT: v_and_b32_e32 v35, 1, v33
+; GFX12-NEXT: v_lshrrev_b16 v7, 14, s3
+; GFX12-NEXT: v_lshrrev_b16 v20, 12, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, v22
+; GFX12-NEXT: v_lshrrev_b16 v22, 1, v45
+; GFX12-NEXT: v_and_b32_e32 v24, 1, v40
+; GFX12-NEXT: v_and_b32_e32 v38, 1, v21
+; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v39
+; GFX12-NEXT: v_lshrrev_b16 v21, 10, s3
+; GFX12-NEXT: v_mov_b32_e32 v39, v1
+; GFX12-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_and_b32 v22, 0xffff, v22
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: s_bfe_u32 s6, s3, 0x10016
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v54, 1, v7
+; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:272
+; GFX12-NEXT: v_dual_mov_b32 v2, v22 :: v_dual_and_b32 v39, 1, v21
+; GFX12-NEXT: v_and_b32_e32 v21, 2, v20
+; GFX12-NEXT: v_and_b32_e64 v47, s7, 8
+; GFX12-NEXT: v_lshrrev_b16 v25, 7, v46
+; GFX12-NEXT: v_lshrrev_b16 v6, 1, v6
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_lshrrev_b16 v21, 1, v21
+; GFX12-NEXT: v_and_b32_e64 v48, s7, 2
+; GFX12-NEXT: v_lshrrev_b16 v27, 3, v47
+; GFX12-NEXT: v_dual_mov_b32 v45, v1 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v21
+; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v25
+; GFX12-NEXT: v_and_b32_e32 v41, 0xffff, v44
+; GFX12-NEXT: v_and_b32_e32 v42, 1, v42
+; GFX12-NEXT: v_dual_mov_b32 v43, v1 :: v_dual_and_b32 v44, 0xffff, v43
+; GFX12-NEXT: v_mov_b32_e32 v2, v21
+; GFX12-NEXT: v_lshrrev_b16 v29, 1, v48
+; GFX12-NEXT: v_and_b32_e32 v31, 0xffff, v27
+; GFX12-NEXT: v_lshrrev_b16 v11, 7, s5
+; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[42:45], s[0:1] offset:496
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: v_and_b32_e32 v42, 1, v23
+; GFX12-NEXT: v_and_b32_e64 v23, s5, 2
+; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v9, 6, s3
+; GFX12-NEXT: v_and_b32_e64 v21, s4, 2
+; GFX12-NEXT: v_dual_mov_b32 v2, v31 :: v_dual_and_b32 v29, 0xffff, v29
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v49
+; GFX12-NEXT: v_and_b32_e32 v46, 1, v9
+; GFX12-NEXT: v_and_b32_e64 v9, s4, 8
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s2
-; GFX12-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX12-NEXT: v_lshrrev_b16 v0, 1, v21
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, v23
+; GFX12-NEXT: v_and_b32_e32 v48, 0xffff, v19
+; GFX12-NEXT: v_mov_b32_e32 v2, v29
+; GFX12-NEXT: v_lshrrev_b16 v9, 3, v9
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v19, 0xffff, v0
+; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v21, 0xffff, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[42:45], s[0:1] offset:464
+; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v11
+; GFX12-NEXT: v_lshrrev_b16 v11, 7, v15
+; GFX12-NEXT: v_and_b32_e32 v42, 1, v17
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
-; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015
-; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v43
-; GFX12-NEXT: v_and_b32_e32 v41, 1, v15
+; GFX12-NEXT: v_lshrrev_b16 v17, 6, s2
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010
+; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10016
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10018
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v33, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v2, v21 :: v_dual_and_b32 v15, 0xffff, v11
+; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v20, 1, v20
+; GFX12-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_mov_b32 v38, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, v15
+; GFX12-NEXT: global_store_b128 v1, v[42:45], s[0:1] offset:240
+; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v29, v1
+; GFX12-NEXT: v_mov_b32_e32 v31, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
-; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v17, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v46, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v49, 6, s4
-; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v43, 0xffff, v42
-; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v45, 1, v32
-; GFX12-NEXT: v_and_b32_e32 v47, 0xffff, v47
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
-; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
-; GFX12-NEXT: v_and_b32_e32 v51, 1, v17
-; GFX12-NEXT: v_dual_mov_b32 v54, v1 :: v_dual_and_b32 v53, 0xffff, v52
-; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: v_mov_b32_e32 v52, v1
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:32
-; GFX12-NEXT: v_and_b32_e32 v41, 1, v49
-; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v46
-; GFX12-NEXT: v_mov_b32_e32 v13, v1
-; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v56
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
-; GFX12-NEXT: v_mov_b32_e32 v0, s7
-; GFX12-NEXT: v_mov_b32_e32 v46, v1
-; GFX12-NEXT: v_mov_b32_e32 v2, v37
-; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v16, 1, v16
-; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v48, 1, v19
-; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX12-NEXT: global_store_b128 v1, v[51:54], s[0:1] offset:16
-; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v52, 1, v21
-; GFX12-NEXT: v_and_b32_e32 v54, 0xffff, v20
-; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v50, 0xffff, v50
-; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v18, 0xffff, v18
-; GFX12-NEXT: v_mov_b32_e32 v51, v1
-; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v27, 1, v39
-; GFX12-NEXT: v_and_b32_e32 v38, 1, v38
-; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v40
-; GFX12-NEXT: v_and_b32_e32 v56, 1, v28
-; GFX12-NEXT: v_and_b32_e32 v58, 0xffff, v22
-; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v1, v[52:55], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v1, v[48:51], s[0:1] offset:352
-; GFX12-NEXT: v_mov_b32_e32 v41, v1
-; GFX12-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT: v_mov_b32_e32 v2, v36
-; GFX12-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_and_b32 v33, 0xffff, v33
-; GFX12-NEXT: v_mov_b32_e32 v32, v1
-; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[56:59], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v1, v[45:48], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
; GFX12-NEXT: v_mov_b32_e32 v0, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, v30
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v12
-; GFX12-NEXT: v_mov_b32_e32 v15, v1
-; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:480
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX12-NEXT: v_mov_b32_e32 v2, v33
+; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v8, 1, v8
+; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v10, 0xffff, v10
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v52
+; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v16, 1, v18
+; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v18, 0xffff, v51
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v50
+; GFX12-NEXT: v_and_b32_e32 v50, 1, v17
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v52, 0xffff, v13
+; GFX12-NEXT: v_mov_b32_e32 v51, v1
+; GFX12-NEXT: v_mov_b32_e32 v49, v1
+; GFX12-NEXT: v_mov_b32_e32 v17, v1
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v1, v[54:57], s[0:1] offset:368
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:336
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:320
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, v35 :: v_dual_mov_b32 v9, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v19
+; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v32, 1, v32
+; GFX12-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_and_b32 v34, 0xffff, v34
+; GFX12-NEXT: v_mov_b32_e32 v35, v1
+; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v12, 1, v12
; GFX12-NEXT: v_mov_b32_e32 v11, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v28, v1
-; GFX12-NEXT: v_mov_b32_e32 v30, v1
-; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v9, v1
+; GFX12-NEXT: s_clause 0x8
+; GFX12-NEXT: global_store_b128 v1, v[50:53], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[32:35], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:480
+; GFX12-NEXT: global_store_b128 v1, v[46:49], s[0:1] offset:304
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:288
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:416
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:160
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a87fa8bf36d9e7..ece93a7cb5ce7a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1135,7 +1135,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1144,9 +1144,11 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Y, T4.X, literal.x,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR * T4.Y, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
@@ -1222,8 +1224,8 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
+; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1231,11 +1233,13 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.W, T4.X, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: BFE_INT T4.X, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
@@ -1524,7 +1528,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1533,13 +1537,17 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T5.W, T5.Y, literal.x,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: LSHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x,
+; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR * T5.Y, T5.X, literal.x,
+; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
+; EG-NEXT: AND_INT T5.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
@@ -1632,8 +1640,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1641,16 +1649,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.Y, literal.x,
+; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
+; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i32:
@@ -1777,29 +1789,37 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
+; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T8.W, T7.Y, literal.x,
+; EG-NEXT: MOV T2.X, T7.X,
+; EG-NEXT: MOV * T3.X, T7.Y,
+; EG-NEXT: MOV T4.X, T7.Z,
+; EG-NEXT: MOV * T5.X, T7.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: LSHR * T7.W, PS, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T8.Z, T7.Y, literal.x,
+; EG-NEXT: AND_INT * T7.Z, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T8.Y, T7.X, literal.x,
-; EG-NEXT: LSHR * T9.W, T7.W, literal.x,
+; EG-NEXT: LSHR T7.Y, T0.W, literal.x,
+; EG-NEXT: LSHR * T8.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT: AND_INT T9.Z, T7.W, literal.x,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T7.X, T0.W, literal.x,
+; EG-NEXT: AND_INT T8.Z, T0.Z, literal.x,
+; EG-NEXT: LSHR * T9.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: LSHR * T9.Y, T7.Z, literal.x,
+; EG-NEXT: LSHR * T8.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T9.X, T7.Z, literal.x,
+; EG-NEXT: AND_INT T8.X, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR * T10.X, PV.W, literal.x,
@@ -1937,34 +1957,42 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
+; EG-NEXT: ALU 27, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x,
+; EG-NEXT: MOV T2.X, T7.X,
+; EG-NEXT: MOV * T3.X, T7.Y,
+; EG-NEXT: MOV T4.X, T7.Z,
+; EG-NEXT: MOV * T5.X, T7.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: BFE_INT * T7.Z, PS, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
+; EG-NEXT: BFE_INT T7.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T8.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T7.W, literal.x,
-; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: BFE_INT T8.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T7.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y,
-; EG-NEXT: LSHR T1.Z, T7.Z, literal.y,
-; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T7.Y, PS, 0.0, literal.y,
+; EG-NEXT: LSHR T1.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T8.W, PV.Z, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T10.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: BFE_INT * T8.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
@@ -2166,50 +2194,70 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; EG-LABEL: constant_zextload_v16i16_to_v16i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 35, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T12.X, 1
+; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @10
+; EG-NEXT: ALU 3, @15, KC0[], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 47, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
-; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
-; EG-NEXT: LSHR * T13.W, T12.Y, literal.x,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV T6.X, T12.X,
+; EG-NEXT: MOV * T7.X, T12.Y,
+; EG-NEXT: MOV T8.X, T12.Z,
+; EG-NEXT: MOV * T9.X, T12.W,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T2.X, T11.X,
+; EG-NEXT: MOV * T3.X, T11.Y,
+; EG-NEXT: MOV T4.X, T11.Z,
+; EG-NEXT: MOV * T5.X, T11.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: MOV T1.Z, T8.X,
+; EG-NEXT: MOV * T1.W, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T6.X,
+; EG-NEXT: MOV * T2.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T11.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T13.Z, T12.Y, literal.x,
+; EG-NEXT: AND_INT * T11.Z, T2.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T13.Y, T12.X, literal.x,
-; EG-NEXT: LSHR * T14.W, T12.W, literal.x,
+; EG-NEXT: LSHR T11.Y, T2.Y, literal.x,
+; EG-NEXT: LSHR * T12.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T12.X, literal.x,
-; EG-NEXT: AND_INT T14.Z, T12.W, literal.x,
-; EG-NEXT: LSHR * T12.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T11.X, T2.Y, literal.x,
+; EG-NEXT: AND_INT T12.Z, T1.W, literal.x,
+; EG-NEXT: LSHR * T13.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: LSHR T14.Y, T12.Z, literal.x,
-; EG-NEXT: LSHR * T15.W, T11.Y, literal.x,
+; EG-NEXT: LSHR T12.Y, T1.Z, literal.x,
+; EG-NEXT: LSHR * T14.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T14.X, T12.Z, literal.x,
-; EG-NEXT: AND_INT T15.Z, T11.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T12.X, T1.Z, literal.x,
+; EG-NEXT: AND_INT T14.Z, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: LSHR T15.Y, T11.X, literal.y,
-; EG-NEXT: LSHR T17.W, T11.W, literal.y,
-; EG-NEXT: AND_INT * T15.X, T11.X, literal.z,
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
+; EG-NEXT: LSHR T14.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T16.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T14.X, T0.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T17.Z, T11.W, literal.x,
+; EG-NEXT: AND_INT T16.Z, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: LSHR T17.Y, T11.Z, literal.y,
-; EG-NEXT: AND_INT * T17.X, T11.Z, literal.z,
+; EG-NEXT: LSHR T17.X, PV.W, literal.x,
+; EG-NEXT: LSHR T16.Y, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T16.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -2430,59 +2478,79 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; EG-LABEL: constant_sextload_v16i16_to_v16i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1
+; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @10
+; EG-NEXT: ALU 3, @15, KC0[], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 51, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV T6.X, T12.X,
+; EG-NEXT: MOV * T7.X, T12.Y,
+; EG-NEXT: MOV T8.X, T12.Z,
+; EG-NEXT: MOV * T9.X, T12.W,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T2.X, T11.X,
+; EG-NEXT: MOV * T3.X, T11.Y,
+; EG-NEXT: MOV T4.X, T11.Z,
+; EG-NEXT: MOV * T5.X, T11.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, T2.X,
+; EG-NEXT: MOV T0.W, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: MOV T1.Z, T9.X,
+; EG-NEXT: MOV * T1.W, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T6.X,
+; EG-NEXT: MOV * T2.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T13.Z, T2.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Y, T12.W, literal.x,
-; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T0.W, T12.Y, literal.x,
-; EG-NEXT: LSHR * T1.W, T11.Y, literal.x,
+; EG-NEXT: BFE_INT T13.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T3.Y, T1.W, literal.x,
+; EG-NEXT: BFE_INT T14.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T1.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T3.W, T2.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Y, T11.W, literal.x,
-; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
+; EG-NEXT: BFE_INT T14.X, T0.W, 0.0, literal.x,
+; EG-NEXT: LSHR T4.Y, T1.Z, literal.x,
+; EG-NEXT: BFE_INT T15.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T13.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T3.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T11.Z, literal.x,
+; EG-NEXT: BFE_INT T15.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T13.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.Z, T1.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T14.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T12.X, literal.x,
-; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T14.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T15.W, T2.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T11.X, PS, literal.x,
-; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.Z, T12.Z, literal.y,
-; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T17.X, PS, literal.x,
+; EG-NEXT: BFE_INT T15.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T16.W, T3.Y, 0.0, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T18.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T16.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
@@ -2842,92 +2910,136 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; EG-LABEL: constant_zextload_v32i16_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @12
-; EG-NEXT: ALU 71, @21, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T19.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T20.X, 1
+; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @18
+; EG-NEXT: ALU 3, @27, KC0[], KC1[]
+; EG-NEXT: TEX 0 @20
+; EG-NEXT: ALU 3, @31, KC0[], KC1[]
+; EG-NEXT: TEX 0 @22
+; EG-NEXT: ALU 3, @35, KC0[], KC1[]
+; EG-NEXT: TEX 0 @24
+; EG-NEXT: ALU 91, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 48, #1
-; EG-NEXT: VTX_READ_128 T22.XYZW, T19.X, 32, #1
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 16, #1
-; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: Fetch clause starting at 20:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 22:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 24:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 26:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 21:
-; EG-NEXT: LSHR * T23.W, T20.Y, literal.x,
+; EG-NEXT: ALU clause starting at 27:
+; EG-NEXT: MOV T14.X, T20.X,
+; EG-NEXT: MOV * T15.X, T20.Y,
+; EG-NEXT: MOV T16.X, T20.Z,
+; EG-NEXT: MOV * T17.X, T20.W,
+; EG-NEXT: ALU clause starting at 31:
+; EG-NEXT: MOV T10.X, T20.X,
+; EG-NEXT: MOV * T11.X, T20.Y,
+; EG-NEXT: MOV T12.X, T20.Z,
+; EG-NEXT: MOV * T13.X, T20.W,
+; EG-NEXT: ALU clause starting at 35:
+; EG-NEXT: MOV T6.X, T20.X,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: MOV T8.X, T20.Z,
+; EG-NEXT: MOV * T9.X, T20.W,
+; EG-NEXT: ALU clause starting at 39:
+; EG-NEXT: MOV T2.X, T19.X,
+; EG-NEXT: MOV * T3.X, T19.Y,
+; EG-NEXT: MOV T4.X, T19.Z,
+; EG-NEXT: MOV * T5.X, T19.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: MOV T1.Z, T8.X,
+; EG-NEXT: MOV * T1.W, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T6.X,
+; EG-NEXT: MOV T2.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T12.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T13.X,
+; EG-NEXT: MOV T3.Z, T10.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T16.X,
+; EG-NEXT: MOV T4.Z, T17.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T14.X, BS:VEC_201
+; EG-NEXT: MOV * T5.Y, T15.X,
+; EG-NEXT: LSHR * T19.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T23.Z, T20.Y, literal.x,
+; EG-NEXT: AND_INT * T19.Z, T5.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T23.Y, T20.X, literal.x,
-; EG-NEXT: LSHR * T24.W, T20.W, literal.x,
+; EG-NEXT: LSHR T19.Y, T4.W, literal.x,
+; EG-NEXT: LSHR * T20.W, T4.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T23.X, T20.X, literal.x,
-; EG-NEXT: AND_INT T24.Z, T20.W, literal.x,
-; EG-NEXT: LSHR * T20.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T19.X, T4.W, literal.x,
+; EG-NEXT: AND_INT T20.Z, T4.Z, literal.x,
+; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: LSHR T24.Y, T20.Z, literal.x,
-; EG-NEXT: LSHR * T25.W, T19.Y, literal.x,
+; EG-NEXT: LSHR T20.Y, T4.Y, literal.x,
+; EG-NEXT: LSHR * T22.W, T3.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T24.X, T20.Z, literal.x,
-; EG-NEXT: AND_INT T25.Z, T19.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T20.X, T4.Y, literal.x,
+; EG-NEXT: AND_INT T22.Z, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T26.X, PV.W, literal.x,
-; EG-NEXT: LSHR T25.Y, T19.X, literal.y,
-; EG-NEXT: LSHR T27.W, T19.W, literal.y,
-; EG-NEXT: AND_INT * T25.X, T19.X, literal.z,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: LSHR T22.Y, T3.Z, literal.y,
+; EG-NEXT: LSHR T24.W, T3.Y, literal.y,
+; EG-NEXT: AND_INT * T22.X, T3.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T27.Z, T19.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T24.Z, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT: LSHR T19.X, PV.W, literal.x,
-; EG-NEXT: LSHR T27.Y, T19.Z, literal.y,
-; EG-NEXT: LSHR T28.W, T22.Y, literal.y,
-; EG-NEXT: AND_INT * T27.X, T19.Z, literal.z,
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: LSHR T24.Y, T2.W, literal.y,
+; EG-NEXT: LSHR T26.W, T2.Z, literal.y,
+; EG-NEXT: AND_INT * T24.X, T2.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T28.Z, T22.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T26.Z, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: LSHR T28.Y, T22.X, literal.y,
-; EG-NEXT: LSHR T30.W, T22.W, literal.y,
-; EG-NEXT: AND_INT * T28.X, T22.X, literal.z,
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: LSHR T26.Y, T2.Y, literal.y,
+; EG-NEXT: LSHR T28.W, T1.W, literal.y,
+; EG-NEXT: AND_INT * T26.X, T2.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T30.Z, T22.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T28.Z, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT: LSHR T22.X, PV.W, literal.x,
-; EG-NEXT: LSHR T30.Y, T22.Z, literal.y,
-; EG-NEXT: LSHR T31.W, T21.Y, literal.y,
-; EG-NEXT: AND_INT * T30.X, T22.Z, literal.z,
+; EG-NEXT: LSHR T29.X, PV.W, literal.x,
+; EG-NEXT: LSHR T28.Y, T1.Z, literal.y,
+; EG-NEXT: LSHR T30.W, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T28.X, T1.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T31.Z, T21.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T30.Z, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
-; EG-NEXT: LSHR T32.X, PV.W, literal.x,
-; EG-NEXT: LSHR T31.Y, T21.X, literal.y,
-; EG-NEXT: LSHR T33.W, T21.W, literal.y,
-; EG-NEXT: AND_INT * T31.X, T21.X, literal.z,
+; EG-NEXT: LSHR T31.X, PV.W, literal.x,
+; EG-NEXT: LSHR T30.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T32.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T30.X, T0.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T33.Z, T21.W, literal.x,
+; EG-NEXT: AND_INT T32.Z, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: LSHR T33.Y, T21.Z, literal.y,
-; EG-NEXT: AND_INT * T33.X, T21.Z, literal.z,
+; EG-NEXT: LSHR T33.X, PV.W, literal.x,
+; EG-NEXT: LSHR T32.Y, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T32.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -3322,107 +3434,155 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; EG-LABEL: constant_sextload_v32i16_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 8, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @12
-; EG-NEXT: ALU 73, @29, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 1
+; EG-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @20
+; EG-NEXT: ALU 3, @29, KC0[], KC1[]
+; EG-NEXT: TEX 0 @22
+; EG-NEXT: ALU 3, @33, KC0[], KC1[]
+; EG-NEXT: TEX 0 @24
+; EG-NEXT: ALU 3, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @26
+; EG-NEXT: ALU 82, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 19, @124, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T23.XYZW, T22.X, 16, #1
-; EG-NEXT: VTX_READ_128 T24.XYZW, T22.X, 32, #1
-; EG-NEXT: VTX_READ_128 T25.XYZW, T22.X, 0, #1
-; EG-NEXT: VTX_READ_128 T22.XYZW, T22.X, 48, #1
-; EG-NEXT: ALU clause starting at 20:
-; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x,
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 20:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; EG-NEXT: Fetch clause starting at 22:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 24:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 26:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 28:
+; EG-NEXT: MOV * T19.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 29:
+; EG-NEXT: MOV T14.X, T20.X,
+; EG-NEXT: MOV * T15.X, T20.Y,
+; EG-NEXT: MOV T16.X, T20.Z,
+; EG-NEXT: MOV * T17.X, T20.W,
+; EG-NEXT: ALU clause starting at 33:
+; EG-NEXT: MOV T10.X, T20.X,
+; EG-NEXT: MOV * T11.X, T20.Y,
+; EG-NEXT: MOV T12.X, T20.Z,
+; EG-NEXT: MOV * T13.X, T20.W,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T6.X, T20.X,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: MOV T8.X, T20.Z,
+; EG-NEXT: MOV * T9.X, T20.W,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: LSHR T20.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
+; EG-NEXT: LSHR T21.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: MOV * T22.X, KC0[2].Z,
+; EG-NEXT: LSHR T22.X, PV.W, literal.x,
+; EG-NEXT: MOV * T2.X, T19.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 29:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T3.X, T19.Y,
+; EG-NEXT: MOV * T4.X, T19.Z,
+; EG-NEXT: MOV T5.X, T19.W,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.Z, T2.X,
+; EG-NEXT: MOV T0.W, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.Y, T6.X,
+; EG-NEXT: MOV T1.Z, T12.X,
+; EG-NEXT: MOV * T1.W, T10.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T16.X,
+; EG-NEXT: MOV T2.Z, T11.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T17.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T13.X,
+; EG-NEXT: MOV T3.Z, T14.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T7.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T9.X,
+; EG-NEXT: MOV T4.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T3.X, BS:VEC_201
+; EG-NEXT: MOV T5.Y, T5.X,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T26.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T19.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.W, T22.W, literal.y,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: LSHR T5.W, T5.Y, literal.y,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T28.X, PS, literal.x,
-; EG-NEXT: LSHR T0.Y, T22.Y, literal.y,
-; EG-NEXT: BFE_INT T29.Z, T25.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T1.W, T24.W, literal.y,
-; EG-NEXT: LSHR * T2.W, T24.Y, literal.y,
+; EG-NEXT: LSHR T24.X, PS, literal.x,
+; EG-NEXT: LSHR T6.Y, T4.W, literal.y,
+; EG-NEXT: BFE_INT T25.Z, T4.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T6.W, T4.Y, literal.y,
+; EG-NEXT: LSHR * T7.W, T3.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T29.X, T25.X, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Y, T23.W, literal.x,
-; EG-NEXT: BFE_INT T30.Z, T25.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T3.W, T23.Y, literal.x,
-; EG-NEXT: LSHR * T4.W, T25.Y, literal.x,
+; EG-NEXT: BFE_INT T25.X, T3.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T7.Y, T3.Y, literal.x,
+; EG-NEXT: BFE_INT T26.Z, T2.W, 0.0, literal.x,
+; EG-NEXT: LSHR T8.W, T2.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T9.W, T4.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T30.X, T25.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T2.Y, T25.W, literal.x,
-; EG-NEXT: BFE_INT T31.Z, T23.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T29.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T25.X, literal.x,
+; EG-NEXT: BFE_INT T26.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T8.Y, T2.W, literal.x,
+; EG-NEXT: BFE_INT T27.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T2.W, T3.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T31.X, T23.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T29.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T32.Z, T23.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T30.W, PV.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T25.Z, literal.x,
+; EG-NEXT: BFE_INT T27.X, T1.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T28.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T26.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T2.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T32.X, T23.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T30.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T25.Z, T24.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.W, T3.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T23.X, literal.x,
+; EG-NEXT: BFE_INT T28.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T26.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T29.Z, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T27.W, T8.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T1.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T25.X, T24.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T33.Z, T24.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T32.W, T1.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T23.Z, literal.x,
+; EG-NEXT: BFE_INT T29.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T27.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T30.Z, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T28.W, T7.Y, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T33.X, T24.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T32.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T23.Z, T22.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T25.W, T2.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T2.W, T24.X, literal.x,
+; EG-NEXT: BFE_INT T30.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T28.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T31.Z, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T29.W, T7.W, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T23.X, T22.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T25.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T34.Z, T22.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T33.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T1.W, T24.Z, literal.x,
+; EG-NEXT: BFE_INT T31.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T29.Y, PS, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T34.X, T22.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T33.Y, PS, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T22.X, literal.x,
-; EG-NEXT: BFE_INT T23.W, T0.Y, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: ALU clause starting at 124:
+; EG-NEXT: BFE_INT T32.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T30.W, T6.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T32.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T30.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T31.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43)
-; EG-NEXT: LSHR T22.X, PS, literal.x,
-; EG-NEXT: BFE_INT T23.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.Z, T22.Z, literal.y,
-; EG-NEXT: BFE_INT T34.W, T0.W, 0.0, literal.y,
+; EG-NEXT: LSHR T33.X, PS, literal.x,
+; EG-NEXT: BFE_INT T31.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T32.W, T5.W, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T24.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T34.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T34.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T32.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; GFX12-LABEL: constant_sextload_v32i16_to_v32i32:
@@ -4107,180 +4267,272 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; EG-LABEL: constant_zextload_v64i16_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @22
-; EG-NEXT: ALU 55, @39, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @30
-; EG-NEXT: ALU 87, @95, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T66.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T64.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T61.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T51.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T38.X, 1
+; EG-NEXT: ALU 0, @52, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @36
+; EG-NEXT: ALU 3, @53, KC0[], KC1[]
+; EG-NEXT: TEX 0 @38
+; EG-NEXT: ALU 3, @57, KC0[], KC1[]
+; EG-NEXT: TEX 0 @40
+; EG-NEXT: ALU 3, @61, KC0[], KC1[]
+; EG-NEXT: TEX 0 @42
+; EG-NEXT: ALU 3, @65, KC0[], KC1[]
+; EG-NEXT: TEX 0 @44
+; EG-NEXT: ALU 3, @69, KC0[], KC1[]
+; EG-NEXT: TEX 0 @46
+; EG-NEXT: ALU 3, @73, KC0[], KC1[]
+; EG-NEXT: TEX 0 @48
+; EG-NEXT: ALU 3, @77, KC0[], KC1[]
+; EG-NEXT: TEX 0 @50
+; EG-NEXT: ALU 96, @81, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 83, @178, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T66.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T65.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T63.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T61.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T59.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T57.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T55.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T53.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T51.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 0, #1
-; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 48, #1
-; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 32, #1
-; EG-NEXT: VTX_READ_128 T41.XYZW, T37.X, 16, #1
-; EG-NEXT: Fetch clause starting at 30:
-; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1
-; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1
-; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1
-; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: MOV * T37.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 39:
-; EG-NEXT: LSHR * T35.W, T38.Y, literal.x,
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 36:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; EG-NEXT: Fetch clause starting at 38:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; EG-NEXT: Fetch clause starting at 40:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 32, #1
+; EG-NEXT: Fetch clause starting at 42:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 48, #1
+; EG-NEXT: Fetch clause starting at 44:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 64, #1
+; EG-NEXT: Fetch clause starting at 46:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 80, #1
+; EG-NEXT: Fetch clause starting at 48:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; EG-NEXT: Fetch clause starting at 50:
+; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 112, #1
+; EG-NEXT: ALU clause starting at 52:
+; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 53:
+; EG-NEXT: MOV T30.X, T36.X,
+; EG-NEXT: MOV * T31.X, T36.Y,
+; EG-NEXT: MOV T32.X, T36.Z,
+; EG-NEXT: MOV * T33.X, T36.W,
+; EG-NEXT: ALU clause starting at 57:
+; EG-NEXT: MOV T26.X, T36.X,
+; EG-NEXT: MOV * T27.X, T36.Y,
+; EG-NEXT: MOV T28.X, T36.Z,
+; EG-NEXT: MOV * T29.X, T36.W,
+; EG-NEXT: ALU clause starting at 61:
+; EG-NEXT: MOV T22.X, T36.X,
+; EG-NEXT: MOV * T23.X, T36.Y,
+; EG-NEXT: MOV T24.X, T36.Z,
+; EG-NEXT: MOV * T25.X, T36.W,
+; EG-NEXT: ALU clause starting at 65:
+; EG-NEXT: MOV T18.X, T36.X,
+; EG-NEXT: MOV * T19.X, T36.Y,
+; EG-NEXT: MOV T20.X, T36.Z,
+; EG-NEXT: MOV * T21.X, T36.W,
+; EG-NEXT: ALU clause starting at 69:
+; EG-NEXT: MOV T14.X, T36.X,
+; EG-NEXT: MOV * T15.X, T36.Y,
+; EG-NEXT: MOV T16.X, T36.Z,
+; EG-NEXT: MOV * T17.X, T36.W,
+; EG-NEXT: ALU clause starting at 73:
+; EG-NEXT: MOV T10.X, T36.X,
+; EG-NEXT: MOV * T11.X, T36.Y,
+; EG-NEXT: MOV T12.X, T36.Z,
+; EG-NEXT: MOV * T13.X, T36.W,
+; EG-NEXT: ALU clause starting at 77:
+; EG-NEXT: MOV T6.X, T36.X,
+; EG-NEXT: MOV * T7.X, T36.Y,
+; EG-NEXT: MOV T8.X, T36.Z,
+; EG-NEXT: MOV * T9.X, T36.W,
+; EG-NEXT: ALU clause starting at 81:
+; EG-NEXT: MOV T2.X, T35.X,
+; EG-NEXT: MOV * T3.X, T35.Y,
+; EG-NEXT: MOV T4.X, T35.Z,
+; EG-NEXT: MOV * T5.X, T35.W,
+; EG-NEXT: MOV T0.Y, T19.X,
+; EG-NEXT: MOV T0.Z, T24.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T25.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T22.X,
+; EG-NEXT: MOV T1.Z, T23.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T28.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T29.X,
+; EG-NEXT: MOV T2.Z, T26.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T27.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T32.X,
+; EG-NEXT: MOV T3.Z, T33.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T30.X, BS:VEC_201
+; EG-NEXT: MOV * T4.Y, T31.X,
+; EG-NEXT: LSHR * T35.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T35.Z, T38.Y, literal.x,
+; EG-NEXT: AND_INT * T35.Z, T4.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T35.Y, T38.X, literal.x,
-; EG-NEXT: LSHR * T36.W, T38.W, literal.x,
+; EG-NEXT: LSHR T35.Y, T3.W, literal.x,
+; EG-NEXT: LSHR * T36.W, T3.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T35.X, T38.X, literal.x,
-; EG-NEXT: AND_INT T36.Z, T38.W, literal.x,
-; EG-NEXT: LSHR * T38.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T35.X, T3.W, literal.x,
+; EG-NEXT: AND_INT T36.Z, T3.Z, literal.x,
+; EG-NEXT: LSHR * T37.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: LSHR T36.Y, T38.Z, literal.x,
-; EG-NEXT: LSHR * T42.W, T41.Y, literal.x,
+; EG-NEXT: LSHR T36.Y, T3.Y, literal.x,
+; EG-NEXT: LSHR * T38.W, T2.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T36.X, T38.Z, literal.x,
-; EG-NEXT: AND_INT T42.Z, T41.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T36.X, T3.Y, literal.x,
+; EG-NEXT: AND_INT T38.Z, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T43.X, PV.W, literal.x,
-; EG-NEXT: LSHR T42.Y, T41.X, literal.y,
-; EG-NEXT: LSHR T44.W, T41.W, literal.y,
-; EG-NEXT: AND_INT * T42.X, T41.X, literal.z,
+; EG-NEXT: LSHR T39.X, PV.W, literal.x,
+; EG-NEXT: LSHR T38.Y, T2.Z, literal.y,
+; EG-NEXT: LSHR T40.W, T2.Y, literal.y,
+; EG-NEXT: AND_INT * T38.X, T2.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T44.Z, T41.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T40.Z, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; EG-NEXT: LSHR T41.X, PV.W, literal.x,
-; EG-NEXT: LSHR T44.Y, T41.Z, literal.y,
-; EG-NEXT: LSHR T45.W, T40.Y, literal.y,
-; EG-NEXT: AND_INT * T44.X, T41.Z, literal.z,
+; EG-NEXT: LSHR T40.Y, T1.W, literal.y,
+; EG-NEXT: LSHR T42.W, T1.Z, literal.y,
+; EG-NEXT: AND_INT * T40.X, T1.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T45.Z, T40.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T42.Z, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT: LSHR T46.X, PV.W, literal.x,
-; EG-NEXT: LSHR T45.Y, T40.X, literal.y,
-; EG-NEXT: LSHR T47.W, T40.W, literal.y,
-; EG-NEXT: AND_INT * T45.X, T40.X, literal.z,
+; EG-NEXT: LSHR T43.X, PV.W, literal.x,
+; EG-NEXT: LSHR T42.Y, T1.Y, literal.y,
+; EG-NEXT: LSHR T44.W, T0.W, literal.y,
+; EG-NEXT: AND_INT * T42.X, T1.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T47.Z, T40.W, literal.x,
+; EG-NEXT: AND_INT T44.Z, T0.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT: LSHR T40.X, PV.W, literal.x,
-; EG-NEXT: LSHR T47.Y, T40.Z, literal.y,
-; EG-NEXT: AND_INT * T47.X, T40.Z, literal.z,
+; EG-NEXT: LSHR T45.X, PV.W, literal.x,
+; EG-NEXT: LSHR T44.Y, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T44.X, T0.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR * T37.W, T39.Y, literal.y,
+; EG-NEXT: LSHR * T46.W, T0.Y, literal.y,
; EG-NEXT: 80(1.121039e-43), 16(2.242078e-44)
-; EG-NEXT: LSHR T48.X, PV.W, literal.x,
-; EG-NEXT: AND_INT * T37.Z, T39.Y, literal.y,
+; EG-NEXT: LSHR T47.X, PV.W, literal.x,
+; EG-NEXT: AND_INT * T46.Z, T0.Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: ALU clause starting at 95:
-; EG-NEXT: LSHR T37.Y, T39.X, literal.x,
-; EG-NEXT: LSHR * T53.W, T39.W, literal.x,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: MOV T0.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T2.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T3.X,
+; EG-NEXT: MOV T1.Z, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T9.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T6.X,
+; EG-NEXT: MOV T2.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T12.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T13.X,
+; EG-NEXT: MOV T3.Z, T10.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T16.X,
+; EG-NEXT: MOV T4.Z, T17.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T14.X, BS:VEC_201
+; EG-NEXT: MOV T5.Y, T15.X,
+; EG-NEXT: MOV T5.Z, T20.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T5.W, T21.X, BS:VEC_201
+; EG-NEXT: MOV * T6.Y, T18.X,
+; EG-NEXT: LSHR T46.Y, PV.Y, literal.x,
+; EG-NEXT: LSHR * T48.W, T5.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T37.X, T39.X, literal.x,
-; EG-NEXT: AND_INT T53.Z, T39.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT * T46.X, T6.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 178:
+; EG-NEXT: AND_INT T48.Z, T5.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR T53.Y, T39.Z, literal.y,
-; EG-NEXT: LSHR T54.W, T52.Y, literal.y,
-; EG-NEXT: AND_INT * T53.X, T39.Z, literal.z,
+; EG-NEXT: LSHR T49.X, PV.W, literal.x,
+; EG-NEXT: LSHR T48.Y, T5.Z, literal.y,
+; EG-NEXT: LSHR T50.W, T5.Y, literal.y,
+; EG-NEXT: AND_INT * T48.X, T5.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T54.Z, T52.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T50.Z, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
-; EG-NEXT: LSHR T55.X, PV.W, literal.x,
-; EG-NEXT: LSHR T54.Y, T52.X, literal.y,
-; EG-NEXT: LSHR T56.W, T52.W, literal.y,
-; EG-NEXT: AND_INT * T54.X, T52.X, literal.z,
+; EG-NEXT: LSHR T51.X, PV.W, literal.x,
+; EG-NEXT: LSHR T50.Y, T4.W, literal.y,
+; EG-NEXT: LSHR T52.W, T4.Z, literal.y,
+; EG-NEXT: AND_INT * T50.X, T4.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T56.Z, T52.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T52.Z, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
-; EG-NEXT: LSHR T52.X, PV.W, literal.x,
-; EG-NEXT: LSHR T56.Y, T52.Z, literal.y,
-; EG-NEXT: LSHR T57.W, T51.Y, literal.y,
-; EG-NEXT: AND_INT * T56.X, T52.Z, literal.z,
+; EG-NEXT: LSHR T53.X, PV.W, literal.x,
+; EG-NEXT: LSHR T52.Y, T4.Y, literal.y,
+; EG-NEXT: LSHR T54.W, T3.W, literal.y,
+; EG-NEXT: AND_INT * T52.X, T4.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T57.Z, T51.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T54.Z, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
-; EG-NEXT: LSHR T58.X, PV.W, literal.x,
-; EG-NEXT: LSHR T57.Y, T51.X, literal.y,
-; EG-NEXT: LSHR T59.W, T51.W, literal.y,
-; EG-NEXT: AND_INT * T57.X, T51.X, literal.z,
+; EG-NEXT: LSHR T55.X, PV.W, literal.x,
+; EG-NEXT: LSHR T54.Y, T3.Z, literal.y,
+; EG-NEXT: LSHR T56.W, T3.Y, literal.y,
+; EG-NEXT: AND_INT * T54.X, T3.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T59.Z, T51.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T56.Z, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
-; EG-NEXT: LSHR T51.X, PV.W, literal.x,
-; EG-NEXT: LSHR T59.Y, T51.Z, literal.y,
-; EG-NEXT: LSHR T60.W, T50.Y, literal.y,
-; EG-NEXT: AND_INT * T59.X, T51.Z, literal.z,
+; EG-NEXT: LSHR T57.X, PV.W, literal.x,
+; EG-NEXT: LSHR T56.Y, T2.W, literal.y,
+; EG-NEXT: LSHR T58.W, T2.Z, literal.y,
+; EG-NEXT: AND_INT * T56.X, T2.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T60.Z, T50.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T58.Z, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
-; EG-NEXT: LSHR T61.X, PV.W, literal.x,
-; EG-NEXT: LSHR T60.Y, T50.X, literal.y,
-; EG-NEXT: LSHR T62.W, T50.W, literal.y,
-; EG-NEXT: AND_INT * T60.X, T50.X, literal.z,
+; EG-NEXT: LSHR T59.X, PV.W, literal.x,
+; EG-NEXT: LSHR T58.Y, T2.Y, literal.y,
+; EG-NEXT: LSHR T60.W, T1.W, literal.y,
+; EG-NEXT: AND_INT * T58.X, T2.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T62.Z, T50.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T60.Z, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
-; EG-NEXT: LSHR T50.X, PV.W, literal.x,
-; EG-NEXT: LSHR T62.Y, T50.Z, literal.y,
-; EG-NEXT: LSHR T63.W, T49.Y, literal.y,
-; EG-NEXT: AND_INT * T62.X, T50.Z, literal.z,
+; EG-NEXT: LSHR T61.X, PV.W, literal.x,
+; EG-NEXT: LSHR T60.Y, T1.Z, literal.y,
+; EG-NEXT: LSHR T62.W, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T60.X, T1.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T63.Z, T49.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T62.Z, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
-; EG-NEXT: LSHR T64.X, PV.W, literal.x,
-; EG-NEXT: LSHR T63.Y, T49.X, literal.y,
-; EG-NEXT: LSHR T65.W, T49.W, literal.y,
-; EG-NEXT: AND_INT * T63.X, T49.X, literal.z,
+; EG-NEXT: LSHR T63.X, PV.W, literal.x,
+; EG-NEXT: LSHR T62.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T64.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T62.X, T0.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T65.Z, T49.W, literal.x,
+; EG-NEXT: AND_INT T64.Z, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
-; EG-NEXT: LSHR T49.X, PV.W, literal.x,
-; EG-NEXT: LSHR T65.Y, T49.Z, literal.y,
-; EG-NEXT: AND_INT * T65.X, T49.Z, literal.z,
+; EG-NEXT: LSHR T65.X, PV.W, literal.x,
+; EG-NEXT: LSHR T64.Y, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T64.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -5026,205 +5278,300 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; EG-LABEL: constant_sextload_v64i16_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 17, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 7 @22
-; EG-NEXT: ALU 75, @56, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 71, @132, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T48.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T56.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T55.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T54.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T53.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T52.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T51.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T35.X, 1
+; EG-NEXT: ALU 0, @52, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @36
+; EG-NEXT: ALU 3, @53, KC0[], KC1[]
+; EG-NEXT: TEX 0 @38
+; EG-NEXT: ALU 3, @57, KC0[], KC1[]
+; EG-NEXT: TEX 0 @40
+; EG-NEXT: ALU 3, @61, KC0[], KC1[]
+; EG-NEXT: TEX 0 @42
+; EG-NEXT: ALU 3, @65, KC0[], KC1[]
+; EG-NEXT: TEX 0 @44
+; EG-NEXT: ALU 3, @69, KC0[], KC1[]
+; EG-NEXT: TEX 0 @46
+; EG-NEXT: ALU 3, @73, KC0[], KC1[]
+; EG-NEXT: TEX 0 @48
+; EG-NEXT: ALU 3, @77, KC0[], KC1[]
+; EG-NEXT: TEX 0 @50
+; EG-NEXT: ALU 90, @81, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 74, @172, KC0[], KC1[]
+; EG-NEXT: ALU 36, @247, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T66.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T65.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T48.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T46.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T44.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T42.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T55.XYZW, T35.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T40.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T38.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T42.XYZW, T41.X, 16, #1
-; EG-NEXT: VTX_READ_128 T43.XYZW, T41.X, 32, #1
-; EG-NEXT: VTX_READ_128 T44.XYZW, T41.X, 0, #1
-; EG-NEXT: VTX_READ_128 T45.XYZW, T41.X, 48, #1
-; EG-NEXT: VTX_READ_128 T46.XYZW, T41.X, 64, #1
-; EG-NEXT: VTX_READ_128 T47.XYZW, T41.X, 80, #1
-; EG-NEXT: VTX_READ_128 T48.XYZW, T41.X, 96, #1
-; EG-NEXT: VTX_READ_128 T41.XYZW, T41.X, 112, #1
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: LSHR T35.X, KC0[2].Y, literal.x,
+; EG-NEXT: Fetch clause starting at 36:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; EG-NEXT: Fetch clause starting at 38:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; EG-NEXT: Fetch clause starting at 40:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 32, #1
+; EG-NEXT: Fetch clause starting at 42:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 48, #1
+; EG-NEXT: Fetch clause starting at 44:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 64, #1
+; EG-NEXT: Fetch clause starting at 46:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 80, #1
+; EG-NEXT: Fetch clause starting at 48:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; EG-NEXT: Fetch clause starting at 50:
+; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 112, #1
+; EG-NEXT: ALU clause starting at 52:
+; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 53:
+; EG-NEXT: MOV T30.X, T36.X,
+; EG-NEXT: MOV * T31.X, T36.Y,
+; EG-NEXT: MOV T32.X, T36.Z,
+; EG-NEXT: MOV * T33.X, T36.W,
+; EG-NEXT: ALU clause starting at 57:
+; EG-NEXT: MOV T26.X, T36.X,
+; EG-NEXT: MOV * T27.X, T36.Y,
+; EG-NEXT: MOV T28.X, T36.Z,
+; EG-NEXT: MOV * T29.X, T36.W,
+; EG-NEXT: ALU clause starting at 61:
+; EG-NEXT: MOV T22.X, T36.X,
+; EG-NEXT: MOV * T23.X, T36.Y,
+; EG-NEXT: MOV T24.X, T36.Z,
+; EG-NEXT: MOV * T25.X, T36.W,
+; EG-NEXT: ALU clause starting at 65:
+; EG-NEXT: MOV T18.X, T36.X,
+; EG-NEXT: MOV * T19.X, T36.Y,
+; EG-NEXT: MOV T20.X, T36.Z,
+; EG-NEXT: MOV * T21.X, T36.W,
+; EG-NEXT: ALU clause starting at 69:
+; EG-NEXT: MOV T14.X, T36.X,
+; EG-NEXT: MOV * T15.X, T36.Y,
+; EG-NEXT: MOV T16.X, T36.Z,
+; EG-NEXT: MOV * T17.X, T36.W,
+; EG-NEXT: ALU clause starting at 73:
+; EG-NEXT: MOV T10.X, T36.X,
+; EG-NEXT: MOV * T11.X, T36.Y,
+; EG-NEXT: MOV T12.X, T36.Z,
+; EG-NEXT: MOV * T13.X, T36.W,
+; EG-NEXT: ALU clause starting at 77:
+; EG-NEXT: MOV T6.X, T36.X,
+; EG-NEXT: MOV * T7.X, T36.Y,
+; EG-NEXT: MOV T8.X, T36.Z,
+; EG-NEXT: MOV * T9.X, T36.W,
+; EG-NEXT: ALU clause starting at 81:
+; EG-NEXT: LSHR T36.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T36.X, PV.W, literal.x,
+; EG-NEXT: LSHR T37.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T37.X, PV.W, literal.x,
+; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T38.X, PV.W, literal.x,
+; EG-NEXT: LSHR T39.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
+; EG-NEXT: LSHR T40.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T40.X, PV.W, literal.x,
-; EG-NEXT: MOV * T41.X, KC0[2].Z,
+; EG-NEXT: LSHR T41.X, PV.W, literal.x,
+; EG-NEXT: MOV * T2.X, T35.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 56:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T3.X, T35.Y,
+; EG-NEXT: MOV * T4.X, T35.Z,
+; EG-NEXT: MOV T5.X, T35.W,
+; EG-NEXT: MOV T0.Y, T24.X,
+; EG-NEXT: MOV T0.Z, T25.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T22.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T28.X,
+; EG-NEXT: MOV T1.Z, T26.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T32.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T27.X,
+; EG-NEXT: MOV T2.Z, T33.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T29.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T30.X,
+; EG-NEXT: MOV T3.Z, T23.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T19.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T31.X,
+; EG-NEXT: MOV T4.Z, T21.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T15.X, BS:VEC_201
+; EG-NEXT: MOV T5.Y, T17.X,
+; EG-NEXT: MOV T5.Z, T11.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T5.W, T13.X, BS:VEC_201
+; EG-NEXT: MOV T6.Y, T7.X,
+; EG-NEXT: MOV T6.Z, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T6.W, T3.X, BS:VEC_201
+; EG-NEXT: MOV T7.Y, T5.X,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T49.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T35.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT: LSHR T50.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T42.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT: LSHR T51.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T43.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT: LSHR T52.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T44.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT: LSHR T53.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.Y, T41.W, literal.y,
-; EG-NEXT: LSHR T0.Z, T41.Y, literal.y,
-; EG-NEXT: LSHR T0.W, T48.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T45.X, PV.W, literal.x,
+; EG-NEXT: LSHR T7.Z, T7.Y, literal.y,
+; EG-NEXT: LSHR T7.W, T6.W, literal.y,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T54.X, PS, literal.x,
-; EG-NEXT: LSHR T1.Y, T48.Y, literal.y,
-; EG-NEXT: LSHR T1.Z, T47.W, literal.y,
-; EG-NEXT: LSHR T1.W, T47.Y, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T46.X, PS, literal.x,
+; EG-NEXT: LSHR T8.Y, T6.Z, literal.y,
+; EG-NEXT: LSHR T8.Z, T6.Y, literal.y,
+; EG-NEXT: LSHR T8.W, T5.W, literal.y,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T55.X, PS, literal.x,
-; EG-NEXT: LSHR T2.Y, T46.W, literal.y,
-; EG-NEXT: LSHR T2.Z, T46.Y, literal.y,
-; EG-NEXT: LSHR T2.W, T45.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T47.X, PS, literal.x,
+; EG-NEXT: LSHR T9.Y, T5.Z, literal.y,
+; EG-NEXT: LSHR T9.Z, T5.Y, literal.y,
+; EG-NEXT: LSHR T9.W, T4.W, literal.y,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T56.X, PS, literal.x,
-; EG-NEXT: LSHR T3.Y, T45.Y, literal.y,
-; EG-NEXT: BFE_INT T57.Z, T44.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T3.W, T43.W, literal.y,
-; EG-NEXT: LSHR * T4.W, T43.Y, literal.y,
+; EG-NEXT: LSHR T48.X, PS, literal.x,
+; EG-NEXT: LSHR T10.Y, T4.Z, literal.y,
+; EG-NEXT: BFE_INT T49.Z, T4.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T10.W, T3.W, literal.y,
+; EG-NEXT: LSHR * T11.W, T3.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T57.X, T44.X, 0.0, literal.x,
-; EG-NEXT: LSHR T4.Y, T42.W, literal.x,
-; EG-NEXT: BFE_INT T58.Z, T44.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T5.W, T42.Y, literal.x,
-; EG-NEXT: LSHR * T6.W, T44.Y, literal.x,
+; EG-NEXT: BFE_INT T49.X, T3.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T11.Y, T2.W, literal.x,
+; EG-NEXT: BFE_INT T50.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T12.W, T2.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T13.W, T4.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T58.X, T44.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T5.Y, T44.W, literal.x,
-; EG-NEXT: BFE_INT T59.Z, T42.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T57.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T44.X, literal.x,
+; EG-NEXT: BFE_INT * T50.X, T1.W, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T59.X, T42.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T57.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T60.Z, T42.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T58.W, PV.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T44.Z, literal.x,
+; EG-NEXT: ALU clause starting at 172:
+; EG-NEXT: LSHR T4.Y, T2.Z, literal.x,
+; EG-NEXT: BFE_INT T51.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T49.W, T13.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T13.W, T3.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T60.X, T42.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T58.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Z, T43.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T59.W, T5.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T42.X, literal.x,
+; EG-NEXT: BFE_INT T51.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T49.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T52.Z, T2.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T50.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T44.X, T43.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T59.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T61.Z, T43.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T60.W, T4.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T42.Z, literal.x,
+; EG-NEXT: BFE_INT T52.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T50.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.Z, T3.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T51.W, T12.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T61.X, T43.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T60.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Z, T45.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T44.W, T4.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T51.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T54.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T52.W, T11.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 132:
-; EG-NEXT: LSHR * T4.W, T43.X, literal.x,
+; EG-NEXT: BFE_INT T54.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T52.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T55.Z, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.W, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T42.X, T45.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T62.Z, T45.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T61.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T3.W, T43.Z, literal.x,
+; EG-NEXT: MOV * T0.Z, T4.X,
+; EG-NEXT: MOV T1.Y, T2.X,
+; EG-NEXT: MOV T1.Z, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.Y, T6.X,
+; EG-NEXT: MOV T2.Z, T12.X,
+; EG-NEXT: MOV * T2.W, T10.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T3.Y, T16.X,
+; EG-NEXT: MOV T3.Z, T14.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T20.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T18.X,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T62.X, T45.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T61.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Z, T46.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T3.W, T45.X, literal.x,
+; EG-NEXT: BFE_INT T55.X, PV.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T56.Z, T4.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T54.W, T1.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T43.X, T46.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T63.Z, T46.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T62.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T2.W, T45.Z, literal.x,
+; EG-NEXT: BFE_INT T56.X, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T54.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T57.Z, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T55.W, T10.W, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T0.W, T4.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T63.X, T46.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T62.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Z, T47.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T2.W, T46.X, literal.x,
+; EG-NEXT: BFE_INT T57.X, T3.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T55.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T58.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T56.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T0.W, T3.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T45.X, T47.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T64.Z, T47.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T63.W, T2.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T2.W, T46.Z, literal.x,
+; EG-NEXT: BFE_INT T58.X, T3.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T56.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T59.Z, T5.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T57.W, T9.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T3.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T64.X, T47.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T63.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Z, T48.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.W, T1.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T47.X, literal.x,
+; EG-NEXT: BFE_INT T59.X, T2.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T57.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T60.Z, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T58.W, T9.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T3.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T46.X, T48.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T65.Z, T48.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T64.W, T1.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T47.Z, literal.x,
+; EG-NEXT: BFE_INT T60.X, T2.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T58.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T61.Z, T6.Y, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 247:
+; EG-NEXT: BFE_INT T59.W, T9.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T2.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T61.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T59.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T62.Z, T6.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T60.W, T8.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T2.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T65.X, T48.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T64.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T47.Z, T41.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T1.W, T48.X, literal.x,
+; EG-NEXT: BFE_INT T62.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T60.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T63.Z, T6.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T61.W, T8.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T0.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T47.X, T41.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T66.Z, T41.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T65.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T0.W, T48.Z, literal.x,
+; EG-NEXT: BFE_INT T63.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T61.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T64.Z, T7.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T62.W, T8.Y, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T0.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T66.X, T41.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T65.Y, PS, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Z, T41.X, literal.x,
-; EG-NEXT: BFE_INT T47.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T64.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T62.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Z, T1.Y, literal.x,
+; EG-NEXT: BFE_INT T63.W, T7.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
-; EG-NEXT: BFE_INT T47.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.Z, T41.Z, literal.y,
-; EG-NEXT: BFE_INT T66.W, T0.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T65.X, PS, literal.x,
+; EG-NEXT: BFE_INT T63.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.y,
+; EG-NEXT: BFE_INT T64.W, T7.Z, 0.0, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T48.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T66.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T66.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T64.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; GFX12-LABEL: constant_sextload_v64i16_to_v64i32:
@@ -5763,7 +6110,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -5772,9 +6119,11 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Z, T4.X, literal.x,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR * T4.Z, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: MOV T4.W, 0.0,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
@@ -5982,25 +6331,29 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
+; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T6.X, T5.Y, literal.x,
-; EG-NEXT: MOV T6.Y, 0.0,
-; EG-NEXT: LSHR T5.Z, T5.X, literal.y,
-; EG-NEXT: AND_INT * T5.X, T5.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: AND_INT T5.X, T0.Z, literal.x,
; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T6.W, 0.0,
-; EG-NEXT: MOV * T5.W, 0.0,
+; EG-NEXT: LSHR T6.Z, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T6.Y, 0.0,
+; EG-NEXT: MOV T5.W, 0.0,
+; EG-NEXT: MOV * T6.W, 0.0,
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -6127,32 +6480,36 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: ASHR * T5.W, T5.X, literal.x,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT: ASHR T5.Z, T5.X, literal.y,
-; EG-NEXT: ASHR * T7.W, T5.Y, literal.z,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ASHR * T6.W, T5.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: LSHR T7.X, PV.W, literal.x,
+; EG-NEXT: ASHR T6.Z, T5.Y, literal.y,
+; EG-NEXT: ASHR * T5.W, T5.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
-; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x,
+; EG-NEXT: BFE_INT T6.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR * T5.Z, T5.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
+; EG-NEXT: ASHR T6.Y, PV.X, literal.y,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: LSHR T8.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T7.Y, PV.X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR * T5.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
@@ -6312,37 +6669,45 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 30, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 1
+; EG-NEXT: ALU 38, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T11.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR * T8.Z, T7.W, literal.x,
+; EG-NEXT: MOV T3.X, T7.Y,
+; EG-NEXT: MOV * T2.X, T7.X,
+; EG-NEXT: MOV T5.X, T7.W,
+; EG-NEXT: MOV * T4.X, T7.Z,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PS,
+; EG-NEXT: MOV * T1.Y, PV.X,
+; EG-NEXT: LSHR * T7.Z, PS, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.W, literal.x,
-; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: LSHR T9.Z, T7.Z, literal.y,
-; EG-NEXT: AND_INT * T9.X, T7.Z, literal.x,
+; EG-NEXT: AND_INT T7.X, T1.Y, literal.x,
+; EG-NEXT: MOV T7.Y, 0.0,
+; EG-NEXT: LSHR T8.Z, T0.W, literal.y,
+; EG-NEXT: AND_INT * T8.X, T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T9.Y, 0.0,
-; EG-NEXT: LSHR * T10.Z, T7.Y, literal.x,
+; EG-NEXT: MOV T8.Y, 0.0,
+; EG-NEXT: LSHR * T9.Z, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T10.X, T7.Y, literal.x,
-; EG-NEXT: MOV T10.Y, 0.0,
-; EG-NEXT: LSHR T7.Z, T7.X, literal.y,
-; EG-NEXT: AND_INT * T7.X, T7.X, literal.x,
+; EG-NEXT: AND_INT T9.X, T0.Z, literal.x,
+; EG-NEXT: MOV T9.Y, 0.0,
+; EG-NEXT: LSHR T10.Z, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T10.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T7.Y, 0.0,
-; EG-NEXT: MOV T8.W, 0.0,
-; EG-NEXT: MOV * T9.W, 0.0,
-; EG-NEXT: MOV T10.W, 0.0,
-; EG-NEXT: MOV * T7.W, 0.0,
+; EG-NEXT: MOV T10.Y, 0.0,
+; EG-NEXT: MOV T7.W, 0.0,
+; EG-NEXT: MOV * T8.W, 0.0,
+; EG-NEXT: MOV T9.W, 0.0,
+; EG-NEXT: MOV * T10.W, 0.0,
; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -6550,51 +6915,54 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 36, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T9.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T9.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T5.X, T7.W,
+; EG-NEXT: MOV * T3.X, T7.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T8.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T9.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: ASHR * T10.W, T7.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: ASHR T10.Z, T7.X, literal.y,
-; EG-NEXT: ASHR * T12.W, T7.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T10.X, T7.X, 0.0, literal.x,
-; EG-NEXT: ASHR T12.Z, T7.Y, literal.x,
-; EG-NEXT: ASHR * T13.W, T7.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T12.X, T7.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T10.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T13.Z, T7.Z, literal.x,
-; EG-NEXT: ASHR * T14.W, T7.W, literal.y,
+; EG-NEXT: LSHR * T10.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T11.X, T7.Y, 0.0, literal.x,
+; EG-NEXT: ASHR * T12.W, T7.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T13.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T14.Z, T7.W, literal.x,
+; EG-NEXT: BFE_INT T13.X, T7.W, 0.0, literal.x,
+; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T12.Z, T7.X, literal.x,
+; EG-NEXT: ASHR * T14.W, T7.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T14.X, T7.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T12.X, T7.X, 0.0, literal.x,
; EG-NEXT: ASHR T13.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR T14.Z, T7.Z, literal.x,
+; EG-NEXT: ASHR * T11.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T14.X, T7.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T11.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T13.W, T0.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T7.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T14.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T14.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T13.Z, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
@@ -6861,64 +7229,84 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
;
; EG-LABEL: constant_zextload_v16i16_to_v16i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 62, @17, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 1
+; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 3, @19, KC0[], KC1[]
+; EG-NEXT: TEX 0 @16
+; EG-NEXT: ALU 74, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T19.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 16:
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 17:
-; EG-NEXT: LSHR * T13.Z, T12.W, literal.x,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T7.X, T12.Y,
+; EG-NEXT: MOV * T6.X, T12.X,
+; EG-NEXT: MOV T9.X, T12.W,
+; EG-NEXT: MOV * T8.X, T12.Z,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV T3.X, T11.Y,
+; EG-NEXT: MOV * T2.X, T11.X,
+; EG-NEXT: MOV T5.X, T11.W,
+; EG-NEXT: MOV * T4.X, T11.Z,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: MOV T0.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T8.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T9.X,
+; EG-NEXT: MOV T1.Z, T2.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T3.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T4.X,
+; EG-NEXT: MOV * T2.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T11.Z, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T12.W, literal.x,
+; EG-NEXT: AND_INT T11.X, T2.Z, literal.x,
+; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: LSHR T12.Z, T2.Y, literal.y,
+; EG-NEXT: AND_INT * T12.X, T2.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T12.Y, 0.0,
+; EG-NEXT: LSHR * T13.Z, T1.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T13.X, T1.W, literal.x,
; EG-NEXT: MOV T13.Y, 0.0,
-; EG-NEXT: LSHR T14.Z, T12.Z, literal.y,
-; EG-NEXT: AND_INT * T14.X, T12.Z, literal.x,
+; EG-NEXT: LSHR T14.Z, T1.Z, literal.y,
+; EG-NEXT: AND_INT * T14.X, T1.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T14.Y, 0.0,
-; EG-NEXT: LSHR * T15.Z, T12.Y, literal.x,
+; EG-NEXT: LSHR * T15.Z, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T15.X, T12.Y, literal.x,
+; EG-NEXT: AND_INT T15.X, T1.Y, literal.x,
; EG-NEXT: MOV T15.Y, 0.0,
-; EG-NEXT: LSHR T12.Z, T12.X, literal.y,
-; EG-NEXT: AND_INT * T12.X, T12.X, literal.x,
+; EG-NEXT: LSHR T16.Z, T0.W, literal.y,
+; EG-NEXT: AND_INT * T16.X, T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T12.Y, 0.0,
-; EG-NEXT: LSHR * T16.Z, T11.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T16.X, T11.W, literal.x,
; EG-NEXT: MOV T16.Y, 0.0,
-; EG-NEXT: LSHR T17.Z, T11.Z, literal.y,
-; EG-NEXT: AND_INT * T17.X, T11.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T17.Y, 0.0,
-; EG-NEXT: LSHR * T18.Z, T11.Y, literal.x,
+; EG-NEXT: LSHR * T17.Z, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T18.X, T11.Y, literal.x,
-; EG-NEXT: MOV T18.Y, 0.0,
-; EG-NEXT: LSHR T11.Z, T11.X, literal.y,
-; EG-NEXT: AND_INT * T11.X, T11.X, literal.x,
+; EG-NEXT: AND_INT T17.X, T0.Z, literal.x,
+; EG-NEXT: MOV T17.Y, 0.0,
+; EG-NEXT: LSHR T18.Z, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T18.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: MOV T18.Y, 0.0,
+; EG-NEXT: MOV T11.W, 0.0,
+; EG-NEXT: MOV * T12.W, 0.0,
; EG-NEXT: MOV T13.W, 0.0,
; EG-NEXT: MOV * T14.W, 0.0,
; EG-NEXT: MOV T15.W, 0.0,
-; EG-NEXT: MOV * T12.W, 0.0,
-; EG-NEXT: MOV T16.W, 0.0,
-; EG-NEXT: MOV * T17.W, 0.0,
-; EG-NEXT: MOV T18.W, 0.0,
-; EG-NEXT: MOV * T11.W, 0.0,
+; EG-NEXT: MOV * T16.W, 0.0,
+; EG-NEXT: MOV T17.W, 0.0,
+; EG-NEXT: MOV * T18.W, 0.0,
; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -7294,90 +7682,102 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
;
; EG-LABEL: constant_sextload_v16i16_to_v16i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 65, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 1, @19, KC0[], KC1[]
+; EG-NEXT: TEX 0 @16
+; EG-NEXT: ALU 71, @21, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T15.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T13.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: Fetch clause starting at 14:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 17:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T5.X, T12.W,
+; EG-NEXT: MOV * T3.X, T12.Y,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: MOV T9.X, T11.W,
+; EG-NEXT: MOV * T7.X, T11.Y,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.Y, PS,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T15.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT: LSHR * T17.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: ASHR * T19.W, T11.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: ASHR T19.Z, T11.X, literal.y,
-; EG-NEXT: ASHR * T21.W, T11.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T19.X, T11.X, 0.0, literal.x,
-; EG-NEXT: ASHR T21.Z, T11.Y, literal.x,
-; EG-NEXT: ASHR * T22.W, T11.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T21.X, T11.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T22.Z, T11.Z, literal.x,
-; EG-NEXT: ASHR * T23.W, T11.W, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: LSHR * T19.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T20.X, T11.Y, 0.0, literal.x,
+; EG-NEXT: ASHR * T21.W, T11.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T22.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T23.Z, T11.W, literal.x,
-; EG-NEXT: ASHR * T24.W, T12.X, literal.y,
+; EG-NEXT: BFE_INT T22.X, T11.W, 0.0, literal.x,
+; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T21.Z, T11.X, literal.x,
+; EG-NEXT: ASHR * T23.W, T11.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T23.X, T11.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T21.X, T11.X, 0.0, literal.x,
; EG-NEXT: ASHR T22.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T24.Z, T12.X, literal.x,
-; EG-NEXT: ASHR * T11.W, T12.Y, literal.y,
+; EG-NEXT: ASHR T23.Z, T11.Z, literal.x,
+; EG-NEXT: ASHR * T20.W, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T24.X, T12.X, 0.0, literal.x,
-; EG-NEXT: ASHR T23.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T11.Z, T12.Y, literal.x,
-; EG-NEXT: ASHR * T25.W, T12.Z, literal.y,
+; EG-NEXT: BFE_INT T23.X, T11.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T20.Z, T1.Y, literal.x,
+; EG-NEXT: ASHR * T22.W, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T24.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T25.Z, T12.Z, literal.x,
-; EG-NEXT: ASHR * T26.W, T12.W, literal.y,
+; EG-NEXT: ASHR T23.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T22.Z, T0.W, literal.x,
+; EG-NEXT: ASHR * T24.W, T12.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T25.X, T12.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.X, T12.W, 0.0, literal.x,
; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T26.Z, T12.W, literal.x,
+; EG-NEXT: ASHR T24.Z, T12.X, literal.x,
+; EG-NEXT: ASHR * T26.W, T12.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T26.X, T12.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T24.X, T12.X, 0.0, literal.x,
; EG-NEXT: ASHR T25.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR T26.Z, T12.Z, literal.x,
+; EG-NEXT: ASHR * T11.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T26.X, T12.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T24.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T11.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T25.W, T0.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T26.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T26.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T25.Z, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v16i16_to_v16i64:
; GFX12: ; %bb.0:
@@ -7864,118 +8264,161 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; EG-LABEL: constant_zextload_v32i16_to_v32i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @22
-; EG-NEXT: ALU 33, @31, KC0[], KC1[]
+; EG-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 92, @65, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T48.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T47.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T46.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T45.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T44.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T43.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T35.X, 1
+; EG-NEXT: ALU 3, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @30
+; EG-NEXT: ALU 3, @41, KC0[], KC1[]
+; EG-NEXT: TEX 0 @32
+; EG-NEXT: ALU 3, @45, KC0[], KC1[]
+; EG-NEXT: TEX 0 @34
+; EG-NEXT: ALU 99, @49, KC0[], KC1[]
+; EG-NEXT: ALU 46, @149, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T50.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T49.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T48.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T46.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T44.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T42.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T40.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T38.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T36.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T35.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 16, #1
-; EG-NEXT: VTX_READ_128 T22.XYZW, T19.X, 32, #1
+; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_128 T29.XYZW, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; EG-NEXT: Fetch clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 32:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 34:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 36:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 31:
-; EG-NEXT: LSHR * T23.Z, T20.W, literal.x,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T15.X, T20.Y,
+; EG-NEXT: MOV * T14.X, T20.X,
+; EG-NEXT: MOV T17.X, T20.W,
+; EG-NEXT: MOV * T16.X, T20.Z,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: MOV T11.X, T20.Y,
+; EG-NEXT: MOV * T10.X, T20.X,
+; EG-NEXT: MOV T13.X, T20.W,
+; EG-NEXT: MOV * T12.X, T20.Z,
+; EG-NEXT: ALU clause starting at 45:
+; EG-NEXT: MOV T7.X, T20.Y,
+; EG-NEXT: MOV * T6.X, T20.X,
+; EG-NEXT: MOV T9.X, T20.W,
+; EG-NEXT: MOV * T8.X, T20.Z,
+; EG-NEXT: ALU clause starting at 49:
+; EG-NEXT: MOV T3.X, T19.Y,
+; EG-NEXT: MOV * T2.X, T19.X,
+; EG-NEXT: MOV T5.X, T19.W,
+; EG-NEXT: MOV * T4.X, T19.Z,
+; EG-NEXT: MOV T0.Y, T13.X,
+; EG-NEXT: MOV T0.Z, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T7.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T8.X,
+; EG-NEXT: MOV T1.Z, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T2.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T3.X,
+; EG-NEXT: MOV T2.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T5.X, BS:VEC_201
+; EG-NEXT: LSHR * T19.Z, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T19.X, T2.W, literal.x,
+; EG-NEXT: MOV T19.Y, 0.0,
+; EG-NEXT: LSHR T20.Z, T2.Z, literal.y,
+; EG-NEXT: AND_INT * T20.X, T2.Z, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T20.Y, 0.0,
+; EG-NEXT: LSHR * T21.Z, T2.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T21.X, T2.Y, literal.x,
+; EG-NEXT: MOV T21.Y, 0.0,
+; EG-NEXT: LSHR T22.Z, T1.W, literal.y,
+; EG-NEXT: AND_INT * T22.X, T1.W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T22.Y, 0.0,
+; EG-NEXT: LSHR * T23.Z, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T23.X, T20.W, literal.x,
+; EG-NEXT: AND_INT T23.X, T1.Z, literal.x,
; EG-NEXT: MOV T23.Y, 0.0,
-; EG-NEXT: LSHR T24.Z, T20.Z, literal.y,
-; EG-NEXT: AND_INT * T24.X, T20.Z, literal.x,
+; EG-NEXT: LSHR T24.Z, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T24.X, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T24.Y, 0.0,
-; EG-NEXT: LSHR * T25.Z, T20.Y, literal.x,
+; EG-NEXT: LSHR * T25.Z, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T25.X, T20.Y, literal.x,
+; EG-NEXT: AND_INT T25.X, T0.W, literal.x,
; EG-NEXT: MOV T25.Y, 0.0,
-; EG-NEXT: LSHR T20.Z, T20.X, literal.y,
-; EG-NEXT: AND_INT * T20.X, T20.X, literal.x,
+; EG-NEXT: LSHR T26.Z, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T26.X, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T20.Y, 0.0,
-; EG-NEXT: LSHR * T26.Z, T22.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T26.X, T22.W, literal.x,
; EG-NEXT: MOV T26.Y, 0.0,
-; EG-NEXT: LSHR T27.Z, T22.Z, literal.y,
-; EG-NEXT: AND_INT * T27.X, T22.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T27.Y, 0.0,
-; EG-NEXT: LSHR * T28.Z, T22.Y, literal.x,
+; EG-NEXT: LSHR T27.Z, T0.Y, literal.x,
+; EG-NEXT: MOV * T0.Z, T14.X,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T28.X, T22.Y, literal.x,
-; EG-NEXT: MOV T28.Y, 0.0,
-; EG-NEXT: LSHR T22.Z, T22.X, literal.y,
-; EG-NEXT: AND_INT * T22.X, T22.X, literal.x,
+; EG-NEXT: MOV * T0.W, T15.X,
+; EG-NEXT: MOV T1.Y, T16.X,
+; EG-NEXT: MOV T1.Z, T17.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T10.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T11.X,
+; EG-NEXT: MOV * T2.Z, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T27.X, T0.Y, literal.x,
+; EG-NEXT: MOV T27.Y, 0.0,
+; EG-NEXT: LSHR T28.Z, PV.Z, literal.y,
+; EG-NEXT: AND_INT * T28.X, PV.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T22.Y, 0.0,
-; EG-NEXT: LSHR * T19.Z, T21.W, literal.x,
+; EG-NEXT: MOV T28.Y, 0.0,
+; EG-NEXT: LSHR * T29.Z, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 65:
-; EG-NEXT: AND_INT T19.X, T21.W, literal.x,
-; EG-NEXT: MOV T19.Y, 0.0,
-; EG-NEXT: LSHR T30.Z, T21.Z, literal.y,
-; EG-NEXT: AND_INT * T30.X, T21.Z, literal.x,
+; EG-NEXT: AND_INT T29.X, T2.Y, literal.x,
+; EG-NEXT: MOV T29.Y, 0.0,
+; EG-NEXT: LSHR T30.Z, T1.W, literal.y,
+; EG-NEXT: AND_INT * T30.X, T1.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T30.Y, 0.0,
-; EG-NEXT: LSHR * T31.Z, T21.Y, literal.x,
+; EG-NEXT: LSHR * T31.Z, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T31.X, T21.Y, literal.x,
+; EG-NEXT: AND_INT T31.X, T1.Z, literal.x,
; EG-NEXT: MOV T31.Y, 0.0,
-; EG-NEXT: LSHR T21.Z, T21.X, literal.y,
-; EG-NEXT: AND_INT * T21.X, T21.X, literal.x,
+; EG-NEXT: LSHR T32.Z, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T32.X, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T21.Y, 0.0,
-; EG-NEXT: LSHR * T32.Z, T29.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T32.X, T29.W, literal.x,
; EG-NEXT: MOV T32.Y, 0.0,
-; EG-NEXT: LSHR T33.Z, T29.Z, literal.y,
-; EG-NEXT: AND_INT * T33.X, T29.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T33.Y, 0.0,
-; EG-NEXT: LSHR * T34.Z, T29.Y, literal.x,
+; EG-NEXT: LSHR * T33.Z, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T34.X, T29.Y, literal.x,
-; EG-NEXT: MOV T34.Y, 0.0,
-; EG-NEXT: LSHR T29.Z, T29.X, literal.y,
-; EG-NEXT: AND_INT * T29.X, T29.X, literal.x,
+; EG-NEXT: AND_INT T33.X, T0.W, literal.x,
+; EG-NEXT: MOV T33.Y, 0.0,
+; EG-NEXT: LSHR T34.Z, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T34.X, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T29.Y, 0.0,
+; EG-NEXT: MOV T34.Y, 0.0,
+; EG-NEXT: MOV T19.W, 0.0,
+; EG-NEXT: MOV * T20.W, 0.0,
+; EG-NEXT: MOV T21.W, 0.0,
+; EG-NEXT: MOV * T22.W, 0.0,
; EG-NEXT: MOV T23.W, 0.0,
; EG-NEXT: MOV * T24.W, 0.0,
; EG-NEXT: MOV T25.W, 0.0,
-; EG-NEXT: MOV * T20.W, 0.0,
-; EG-NEXT: MOV T26.W, 0.0,
-; EG-NEXT: MOV * T27.W, 0.0,
-; EG-NEXT: MOV T28.W, 0.0,
-; EG-NEXT: MOV * T22.W, 0.0,
-; EG-NEXT: MOV T19.W, 0.0,
+; EG-NEXT: MOV * T26.W, 0.0,
+; EG-NEXT: MOV T27.W, 0.0,
+; EG-NEXT: MOV * T28.W, 0.0,
+; EG-NEXT: MOV T29.W, 0.0,
; EG-NEXT: MOV * T30.W, 0.0,
; EG-NEXT: MOV T31.W, 0.0,
-; EG-NEXT: MOV * T21.W, 0.0,
-; EG-NEXT: MOV T32.W, 0.0,
-; EG-NEXT: MOV * T33.W, 0.0,
-; EG-NEXT: MOV T34.W, 0.0,
-; EG-NEXT: MOV * T29.W, 0.0,
+; EG-NEXT: MOV * T32.W, 0.0,
+; EG-NEXT: MOV T33.W, 0.0,
+; EG-NEXT: MOV * T34.W, 0.0,
+; EG-NEXT: ALU clause starting at 149:
; EG-NEXT: LSHR T35.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -8688,169 +9131,198 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; EG-LABEL: constant_sextload_v32i16_to_v32i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 55, @31, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @24
-; EG-NEXT: ALU 74, @87, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 1
+; EG-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @28
+; EG-NEXT: ALU 1, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @30
+; EG-NEXT: ALU 1, @39, KC0[], KC1[]
+; EG-NEXT: TEX 0 @32
+; EG-NEXT: ALU 1, @41, KC0[], KC1[]
+; EG-NEXT: TEX 0 @34
+; EG-NEXT: ALU 87, @43, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @131, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T36.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T35.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T32.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T30.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T28.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T23.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_128 T38.XYZW, T19.X, 48, #1
-; EG-NEXT: VTX_READ_128 T39.XYZW, T19.X, 32, #1
-; EG-NEXT: VTX_READ_128 T40.XYZW, T19.X, 16, #1
-; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 31:
-; EG-NEXT: LSHR T21.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T22.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 28:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T22.X, 48, #1
+; EG-NEXT: Fetch clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T22.X, 32, #1
+; EG-NEXT: Fetch clause starting at 32:
+; EG-NEXT: VTX_READ_128 T21.XYZW, T22.X, 16, #1
+; EG-NEXT: Fetch clause starting at 34:
+; EG-NEXT: VTX_READ_128 T22.XYZW, T22.X, 0, #1
+; EG-NEXT: ALU clause starting at 36:
+; EG-NEXT: MOV * T22.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T5.X, T19.W,
+; EG-NEXT: MOV * T3.X, T19.Y,
+; EG-NEXT: ALU clause starting at 39:
+; EG-NEXT: MOV T9.X, T20.W,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: MOV T13.X, T21.W,
+; EG-NEXT: MOV * T11.X, T21.Y,
+; EG-NEXT: ALU clause starting at 43:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T26.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT: LSHR T31.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
+; EG-NEXT: LSHR * T31.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T34.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: ASHR * T35.W, T20.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: LSHR T35.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; EG-NEXT: LSHR T36.X, PV.W, literal.x,
-; EG-NEXT: ASHR T35.Z, T20.X, literal.y,
-; EG-NEXT: ASHR * T37.W, T20.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T35.X, T20.X, 0.0, literal.x,
-; EG-NEXT: ASHR * T37.Z, T20.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T37.X, T20.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T35.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T19.W, T20.Z, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT: LSHR * T37.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T38.X, T22.W, 0.0, literal.x,
+; EG-NEXT: ASHR * T39.W, T22.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: ALU clause starting at 87:
-; EG-NEXT: ASHR T19.Z, T20.Z, literal.x,
-; EG-NEXT: ASHR * T41.W, T20.W, literal.y,
+; EG-NEXT: BFE_INT T40.X, T22.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T38.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T39.Z, T22.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T19.X, T20.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T37.Y, T37.X, literal.y,
-; EG-NEXT: ASHR T41.Z, T20.W, literal.x,
-; EG-NEXT: ASHR * T42.W, T40.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T39.X, T22.X, 0.0, literal.x,
+; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T41.W, T22.Z, literal.y,
+; EG-NEXT: MOV * T17.X, T22.W,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T41.X, T20.W, 0.0, literal.x,
-; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T42.Z, T40.X, literal.x,
-; EG-NEXT: ASHR * T20.W, T40.Y, literal.y,
+; EG-NEXT: MOV T15.X, T22.Y,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T9.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T7.X,
+; EG-NEXT: MOV T1.Z, T13.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T17.X,
+; EG-NEXT: MOV T2.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T41.Z, T22.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T40.W, PV.Z, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T41.X, T22.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T39.Y, T39.X, literal.y,
+; EG-NEXT: ASHR T40.Z, T2.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T38.W, T2.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T42.X, T40.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T22.X, T21.Y, 0.0, literal.x,
; EG-NEXT: ASHR T41.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T20.Z, T40.Y, literal.x,
-; EG-NEXT: ASHR * T43.W, T40.Z, literal.y,
+; EG-NEXT: ASHR T38.Z, T2.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T42.W, T21.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T20.X, T40.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T42.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T43.Z, T40.Z, literal.x,
-; EG-NEXT: ASHR * T44.W, T40.W, literal.y,
+; EG-NEXT: BFE_INT T43.X, T21.W, 0.0, literal.x,
+; EG-NEXT: ASHR T22.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T42.Z, T21.X, literal.x,
+; EG-NEXT: ASHR * T44.W, T21.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T43.X, T40.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T44.Z, T40.W, literal.x,
-; EG-NEXT: ASHR * T45.W, T39.X, literal.y,
+; EG-NEXT: BFE_INT T42.X, T21.X, 0.0, literal.x,
+; EG-NEXT: ASHR * T43.Y, PV.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T44.X, T40.W, 0.0, literal.x,
-; EG-NEXT: ASHR T43.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T45.Z, T39.X, literal.x,
-; EG-NEXT: ASHR * T40.W, T39.Y, literal.y,
+; EG-NEXT: ALU clause starting at 131:
+; EG-NEXT: ASHR T44.Z, T21.Z, literal.x,
+; EG-NEXT: ASHR * T22.W, T1.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T45.X, T39.X, 0.0, literal.x,
-; EG-NEXT: ASHR T44.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T40.Z, T39.Y, literal.x,
-; EG-NEXT: ASHR * T46.W, T39.Z, literal.y,
+; EG-NEXT: BFE_INT T44.X, T21.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T42.Y, T42.X, literal.y,
+; EG-NEXT: ASHR T22.Z, T1.W, literal.x,
+; EG-NEXT: ASHR * T43.W, T1.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T40.X, T39.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T45.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T46.Z, T39.Z, literal.x,
-; EG-NEXT: ASHR * T47.W, T39.W, literal.y,
+; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T44.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T43.Z, T1.Z, literal.x,
+; EG-NEXT: ASHR * T45.W, T20.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T46.X, T39.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T47.Z, T39.W, literal.x,
-; EG-NEXT: ASHR * T48.W, T38.X, literal.y,
+; EG-NEXT: BFE_INT T46.X, T20.W, 0.0, literal.x,
+; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T45.Z, T20.X, literal.x,
+; EG-NEXT: ASHR * T47.W, T20.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T47.X, T39.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T45.X, T20.X, 0.0, literal.x,
; EG-NEXT: ASHR T46.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T48.Z, T38.X, literal.x,
-; EG-NEXT: ASHR * T39.W, T38.Y, literal.y,
+; EG-NEXT: ASHR T47.Z, T20.Z, literal.x,
+; EG-NEXT: ASHR * T21.W, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T48.X, T38.X, 0.0, literal.x,
-; EG-NEXT: ASHR T47.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T39.Z, T38.Y, literal.x,
-; EG-NEXT: ASHR * T49.W, T38.Z, literal.y,
+; EG-NEXT: BFE_INT T47.X, T20.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T45.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T21.Z, T1.Y, literal.x,
+; EG-NEXT: ASHR * T46.W, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T39.X, T38.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T48.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T49.Z, T38.Z, literal.x,
-; EG-NEXT: ASHR * T50.W, T38.W, literal.y,
+; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T47.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T46.Z, T0.W, literal.x,
+; EG-NEXT: ASHR * T48.W, T19.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T49.X, T38.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T39.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T50.Z, T38.W, literal.x,
+; EG-NEXT: BFE_INT T49.X, T19.W, 0.0, literal.x,
+; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T48.Z, T19.X, literal.x,
+; EG-NEXT: ASHR * T50.W, T19.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T50.X, T38.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T48.X, T19.X, 0.0, literal.x,
; EG-NEXT: ASHR T49.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR T50.Z, T19.Z, literal.x,
+; EG-NEXT: ASHR * T20.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T38.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T50.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T50.X, T19.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T48.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T20.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T49.W, T0.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T19.X, PV.W, literal.x,
+; EG-NEXT: ASHR T50.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T49.Z, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v32i16_to_v32i64:
; GFX12: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 889755c23bbc72..8e40a60841daba 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -238,17 +238,28 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 27, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT MSKOR T6.XW, T8.X
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T7.X
+; EG-NEXT: ALU 29, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: MEM_RAT MSKOR T6.XW, T7.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: MOV * T2.X, T5.X,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.X,
; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Z, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T5.X, PV.W, PS,
+; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
@@ -261,19 +272,10 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: LSHL * T6.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T6.Y, 0.0,
-; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
-; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T6.Z, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
+; EG-NEXT: MOV T5.Z, 0.0,
+; EG-NEXT: MOV * T6.Z, 0.0,
+; EG-NEXT: LSHR T7.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_load_v3i8:
@@ -1088,23 +1090,25 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
+; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T5.Y, T4.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT * T4.Y, PV.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T5.X, T4.X, literal.x,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
-; EG-NEXT: BFE_UINT T4.X, T4.X, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T6.X, T0.Y, literal.x, T0.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
@@ -1192,26 +1196,28 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T4.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XY, T5.X, 1
+; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT * T6.X, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T5.X, PV.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T7.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.W, T4.X, literal.x,
+; EG-NEXT: BFE_INT T6.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T4.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T7.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T6.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
@@ -1300,7 +1306,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1309,14 +1315,16 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT * T4.Z, PV.Y, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T4.Y, T4.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T4.W, T4.X, literal.y,
+; EG-NEXT: BFE_UINT T4.Y, T0.Y, literal.x, T0.W,
+; EG-NEXT: LSHR * T4.W, T0.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
@@ -1407,8 +1415,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T4.X, 1
+; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1416,17 +1424,19 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: BFE_INT T4.X, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
+; EG-NEXT: BFE_INT T4.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T5.Z, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.x,
+; EG-NEXT: BFE_INT T4.Z, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
@@ -1550,32 +1560,36 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
+; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV T3.X, T5.X,
+; EG-NEXT: MOV * T2.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.Z, PV.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT * T5.Z, PV.Z, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T7.Z, T5.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T6.W, T5.X, literal.z,
+; EG-NEXT: BFE_UINT T5.Y, T0.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T6.Z, T0.Y, literal.y, T0.W,
+; EG-NEXT: LSHR * T5.W, T0.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T6.X, T5.X, literal.x,
-; EG-NEXT: BFE_UINT T7.Y, T5.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T5.X, T0.Z, literal.x,
+; EG-NEXT: BFE_UINT T6.Y, T0.Y, literal.y, T0.W,
+; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR * T7.W, T5.Y, literal.x,
+; EG-NEXT: LSHR * T6.W, T0.Y, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T7.X, T5.Y, literal.x,
+; EG-NEXT: AND_INT T6.X, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
@@ -1711,38 +1725,42 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 23, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
+; EG-NEXT: ALU 27, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
+; EG-NEXT: MOV T3.X, T5.X,
+; EG-NEXT: MOV * T2.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV * T0.Z, PV.X,
+; EG-NEXT: BFE_INT T5.X, PV.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T5.Y, literal.y,
-; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.z,
+; EG-NEXT: BFE_INT T6.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.Y, T5.Y, literal.x,
-; EG-NEXT: BFE_INT T6.Z, PS, 0.0, literal.y,
-; EG-NEXT: BFE_INT T7.W, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
+; EG-NEXT: LSHR T1.Y, T0.Y, literal.x,
+; EG-NEXT: BFE_INT T5.Z, PS, 0.0, literal.y,
+; EG-NEXT: BFE_INT T6.W, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT T6.Y, PS, 0.0, literal.y,
-; EG-NEXT: BFE_INT T7.Z, PV.Y, 0.0, literal.y,
-; EG-NEXT: LSHR T0.W, T5.Y, literal.y,
+; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T5.Y, PS, 0.0, literal.y,
+; EG-NEXT: BFE_INT T6.Z, PV.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T8.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT * T6.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
@@ -1945,51 +1963,59 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 47, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: MOV * T0.W, literal.x,
+; EG-NEXT: MOV T5.X, T7.X,
+; EG-NEXT: MOV * T4.X, T7.Y,
+; EG-NEXT: MOV T3.X, T7.Z,
+; EG-NEXT: MOV * T2.X, T7.W,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.Z, PV.X,
+; EG-NEXT: MOV T0.W, T4.X,
+; EG-NEXT: MOV * T1.Y, T5.X,
+; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T8.Z, T7.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT * T7.Z, T1.Y, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T9.Z, T7.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T8.W, T7.X, literal.z,
+; EG-NEXT: BFE_UINT T7.Y, T1.Y, literal.x, T1.W,
+; EG-NEXT: BFE_UINT T8.Z, T0.W, literal.y, T1.W,
+; EG-NEXT: LSHR * T7.W, T1.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT: BFE_UINT T9.Y, T7.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T7.X, T1.Y, literal.x,
+; EG-NEXT: BFE_UINT T8.Y, T0.W, literal.y, T1.W,
+; EG-NEXT: LSHR * T9.X, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T10.Z, T7.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T9.W, T7.Y, literal.y,
+; EG-NEXT: BFE_UINT T10.Z, T0.Z, literal.x, T1.W,
+; EG-NEXT: LSHR * T8.W, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T9.X, T7.Y, literal.x,
-; EG-NEXT: BFE_UINT T10.Y, T7.Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T8.X, T0.W, literal.x,
+; EG-NEXT: BFE_UINT T10.Y, T0.Z, literal.y, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T12.Z, T7.W, literal.y, T0.W,
-; EG-NEXT: LSHR T10.W, T7.Z, literal.z,
-; EG-NEXT: AND_INT * T10.X, T7.Z, literal.w,
+; EG-NEXT: BFE_UINT T12.Z, T0.Y, literal.y, T1.W,
+; EG-NEXT: LSHR T10.W, T0.Z, literal.z,
+; EG-NEXT: AND_INT * T10.X, T0.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T12.Y, T7.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T12.Y, T0.Y, literal.x, T1.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
; EG-NEXT: LSHR T13.X, PV.W, literal.x,
-; EG-NEXT: LSHR T12.W, T7.W, literal.y,
-; EG-NEXT: AND_INT * T12.X, T7.W, literal.z,
+; EG-NEXT: LSHR T12.W, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T12.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -2213,64 +2239,72 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 47, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T8.X, 1
+; EG-NEXT: ALU 55, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR T0.W, T7.W, literal.y,
-; EG-NEXT: LSHR * T1.W, T7.Z, literal.z,
+; EG-NEXT: MOV T5.X, T7.X,
+; EG-NEXT: MOV * T4.X, T7.Y,
+; EG-NEXT: MOV T3.X, T7.Z,
+; EG-NEXT: MOV * T2.X, T7.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: MOV T0.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.Y, PS,
+; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T1.W, PS, literal.y,
+; EG-NEXT: LSHR * T2.W, PV.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Y, T7.W, literal.y,
-; EG-NEXT: LSHR T0.Z, T7.Z, literal.z,
-; EG-NEXT: LSHR T2.W, T7.Y, literal.x,
-; EG-NEXT: LSHR * T3.W, T7.X, literal.y,
+; EG-NEXT: BFE_INT T8.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T2.Y, T1.Y, literal.y,
+; EG-NEXT: LSHR T1.Z, T0.W, literal.z,
+; EG-NEXT: LSHR T3.W, T0.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T4.W, T0.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T10.X, T7.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Y, T7.Z, literal.y,
-; EG-NEXT: LSHR T1.Z, T7.Y, literal.y,
-; EG-NEXT: BFE_INT T9.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T7.X, literal.z,
+; EG-NEXT: BFE_INT T9.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T3.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T2.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T8.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T4.W, T0.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T11.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T2.Y, T7.Y, literal.y,
-; EG-NEXT: BFE_INT T9.Z, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T10.W, PV.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T7.X, literal.x,
+; EG-NEXT: BFE_INT T10.X, T0.W, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Y, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T8.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T12.X, T7.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T9.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T10.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T11.W, T1.Y, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_INT T11.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Z, PV.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T10.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T7.X, PS, literal.x,
-; EG-NEXT: BFE_INT T10.Y, T2.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T11.Z, T0.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T12.W, T0.Y, 0.0, literal.y,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T12.X, PS, literal.x,
+; EG-NEXT: BFE_INT T9.Y, T3.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T10.Z, T1.Z, 0.0, literal.y,
+; EG-NEXT: BFE_INT T11.W, T2.Y, 0.0, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T13.X, PS, literal.x,
-; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T12.Z, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T0.W, T7.W, literal.y, BS:VEC_201
+; EG-NEXT: BFE_INT T10.Y, T2.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T11.Z, T1.W, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T0.W, T1.Y, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T14.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T12.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT * T11.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
@@ -2627,94 +2661,114 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v32i8_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 75, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 3, @19, KC0[], KC1[]
+; EG-NEXT: TEX 0 @16
+; EG-NEXT: ALU 87, @23, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 16:
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 17:
-; EG-NEXT: MOV * T0.W, literal.x,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T9.X, T12.X,
+; EG-NEXT: MOV * T8.X, T12.Y,
+; EG-NEXT: MOV T7.X, T12.Z,
+; EG-NEXT: MOV * T6.X, T12.W,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV T5.X, T11.X,
+; EG-NEXT: MOV * T4.X, T11.Y,
+; EG-NEXT: MOV T3.X, T11.Z,
+; EG-NEXT: MOV * T2.X, T11.W,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.Z, PV.X,
+; EG-NEXT: MOV T0.W, T4.X,
+; EG-NEXT: MOV * T1.Y, T5.X,
+; EG-NEXT: MOV T1.Z, T6.X,
+; EG-NEXT: MOV * T1.W, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T8.X,
+; EG-NEXT: MOV T2.Z, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT * T11.Z, PV.Z, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T14.Z, T11.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T13.W, T11.X, literal.z,
+; EG-NEXT: BFE_UINT T11.Y, T2.Z, literal.x, T2.W,
+; EG-NEXT: BFE_UINT T12.Z, T2.Y, literal.y, T2.W,
+; EG-NEXT: LSHR * T11.W, T2.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T11.X, literal.x,
-; EG-NEXT: BFE_UINT T14.Y, T11.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T11.X, T2.Z, literal.x,
+; EG-NEXT: BFE_UINT T12.Y, T2.Y, literal.y, T2.W,
+; EG-NEXT: LSHR * T13.X, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T15.Z, T11.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T14.W, T11.Y, literal.y,
+; EG-NEXT: BFE_UINT T14.Z, T1.W, literal.x, T2.W,
+; EG-NEXT: LSHR * T12.W, T2.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T14.X, T11.Y, literal.x,
-; EG-NEXT: BFE_UINT T15.Y, T11.Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T12.X, T2.Y, literal.x,
+; EG-NEXT: BFE_UINT T14.Y, T1.W, literal.y, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T17.Z, T11.W, literal.y, T0.W,
-; EG-NEXT: LSHR T15.W, T11.Z, literal.z,
-; EG-NEXT: AND_INT * T15.X, T11.Z, literal.w,
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T16.Z, T1.Z, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T14.W, T1.W, literal.z,
+; EG-NEXT: AND_INT * T14.X, T1.W, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T17.Y, T11.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T16.Y, T1.Z, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T17.W, T11.W, literal.z,
-; EG-NEXT: AND_INT * T17.X, T11.W, literal.w,
+; EG-NEXT: LSHR T17.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T18.Z, T1.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T16.W, T1.Z, literal.z,
+; EG-NEXT: AND_INT * T16.X, T1.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T19.Y, T12.X, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T18.Y, T1.Y, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T21.Z, T12.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T19.W, T12.X, literal.z,
-; EG-NEXT: AND_INT * T19.X, T12.X, literal.w,
+; EG-NEXT: LSHR T19.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T20.Z, T0.W, literal.y, T2.W,
+; EG-NEXT: LSHR T18.W, T1.Y, literal.z,
+; EG-NEXT: AND_INT * T18.X, T1.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T21.Y, T12.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T20.Y, T0.W, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT: LSHR T12.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T22.Z, T12.Z, literal.y, T0.W,
-; EG-NEXT: LSHR T21.W, T12.Y, literal.z,
-; EG-NEXT: AND_INT * T21.X, T12.Y, literal.w,
+; EG-NEXT: LSHR T21.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T22.Z, T0.Z, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T20.W, T0.W, literal.z,
+; EG-NEXT: AND_INT * T20.X, T0.W, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T22.Y, T12.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT T22.Y, T0.Z, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T24.Z, T12.W, literal.y, T0.W,
-; EG-NEXT: LSHR T22.W, T12.Z, literal.z,
-; EG-NEXT: AND_INT * T22.X, T12.Z, literal.w,
+; EG-NEXT: BFE_UINT T24.Z, T0.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T22.W, T0.Z, literal.z,
+; EG-NEXT: AND_INT * T22.X, T0.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T24.Y, T12.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T24.Y, T0.Y, literal.x, T2.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
-; EG-NEXT: LSHR T24.W, T12.W, literal.y,
-; EG-NEXT: AND_INT * T24.X, T12.W, literal.z,
+; EG-NEXT: LSHR T24.W, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T24.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -3111,122 +3165,142 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v32i8_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @14
-; EG-NEXT: ALU 18, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
-; EG-NEXT: ALU 75, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T16.X, 0
+; EG-NEXT: ALU 3, @21, KC0[], KC1[]
+; EG-NEXT: TEX 0 @18
+; EG-NEXT: ALU 81, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 25, @107, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T24.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T13.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T11.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 14:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 18:
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 19:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: MOV T9.X, T12.X,
+; EG-NEXT: MOV * T8.X, T12.Y,
+; EG-NEXT: MOV T7.X, T12.Z,
+; EG-NEXT: MOV * T6.X, T12.W,
+; EG-NEXT: ALU clause starting at 25:
+; EG-NEXT: MOV T5.X, T11.X,
+; EG-NEXT: MOV * T4.X, T11.Y,
+; EG-NEXT: MOV T3.X, T11.Z,
+; EG-NEXT: MOV * T2.X, T11.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: MOV T0.Z, PV.X,
+; EG-NEXT: MOV * T0.W, PS,
+; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T15.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.Z, T12.W, literal.y,
-; EG-NEXT: LSHR T0.W, T12.Z, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: LSHR T1.Z, T0.W, literal.y,
+; EG-NEXT: LSHR T1.W, T0.Z, literal.z,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T16.X, PS, literal.x,
-; EG-NEXT: LSHR T0.Y, T12.W, literal.y,
-; EG-NEXT: LSHR T1.Z, T12.Z, literal.z,
-; EG-NEXT: LSHR T1.W, T12.Y, literal.w,
-; EG-NEXT: LSHR * T2.W, T12.Z, literal.y,
+; EG-NEXT: LSHR T14.X, PS, literal.x,
+; EG-NEXT: LSHR T1.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T2.Z, T0.Z, literal.z,
+; EG-NEXT: LSHR T2.W, T0.Y, literal.w,
+; EG-NEXT: LSHR * T3.W, T0.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, T8.X,
+; EG-NEXT: MOV * T3.Y, T7.X,
+; EG-NEXT: MOV * T3.Z, T6.X,
+; EG-NEXT: MOV T4.Y, T9.X,
+; EG-NEXT: MOV T4.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x,
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: LSHR T1.Y, T12.Y, literal.y,
-; EG-NEXT: LSHR T2.Z, T12.Y, literal.z,
-; EG-NEXT: LSHR T3.W, T12.X, literal.y,
-; EG-NEXT: LSHR * T4.W, T12.X, literal.z,
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
+; EG-NEXT: LSHR T5.Y, T0.Y, literal.y,
+; EG-NEXT: LSHR T5.Z, T0.Y, literal.z,
+; EG-NEXT: LSHR T4.W, PV.Z, literal.y,
+; EG-NEXT: LSHR * T5.W, PV.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T18.X, T11.X, 0.0, literal.x,
-; EG-NEXT: LSHR T2.Y, T11.W, literal.y,
-; EG-NEXT: LSHR T3.Z, T11.W, literal.z,
-; EG-NEXT: LSHR T5.W, T11.Z, literal.y,
-; EG-NEXT: LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT: BFE_INT T16.X, T4.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T6.Y, T3.Z, literal.y,
+; EG-NEXT: LSHR T6.Z, T3.Z, literal.z,
+; EG-NEXT: LSHR T6.W, T3.Y, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T7.W, T4.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T19.X, T11.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T3.Y, T11.Z, literal.y,
-; EG-NEXT: LSHR T4.Z, T11.Y, literal.y,
-; EG-NEXT: BFE_INT T18.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT: BFE_INT T17.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T7.Y, T3.Y, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T7.Z, T2.Y, literal.y,
+; EG-NEXT: BFE_INT T16.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T7.W, T4.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T20.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T4.Y, T11.Y, literal.y,
-; EG-NEXT: BFE_INT T18.Z, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T19.W, PV.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T11.X, literal.x,
+; EG-NEXT: BFE_INT T18.X, T3.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T8.Y, T2.Y, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T16.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T17.W, PV.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T7.W, T4.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T21.X, T11.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T19.X, T3.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T17.Z, PV.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T18.W, T7.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T7.W, T2.Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T20.X, T4.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T17.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T18.Z, T6.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T19.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T6.W, T3.Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T21.X, T0.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T18.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T19.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.W, T3.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T11.Y, literal.x,
+; EG-NEXT: BFE_INT T19.Z, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T20.W, T5.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T3.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T22.X, T12.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T22.X, T0.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T19.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.Z, T5.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.W, T3.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T11.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT T22.W, T4.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T11.W, literal.x,
+; EG-NEXT: BFE_INT * T20.Z, T4.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T23.X, T12.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T11.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T3.W, T12.X, literal.x,
+; EG-NEXT: ALU clause starting at 107:
+; EG-NEXT: BFE_INT T21.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T4.W, T4.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T24.X, T12.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T22.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T11.Z, T1.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T23.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_INT T23.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T20.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T21.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T22.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT: LSHR T12.X, PS, literal.x,
-; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T23.Z, T1.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T24.W, T0.Y, 0.0, literal.y,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T24.X, PS, literal.x,
+; EG-NEXT: BFE_INT T21.Y, T2.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T22.Z, T2.Z, 0.0, literal.y,
+; EG-NEXT: BFE_INT T23.W, T1.Y, 0.0, literal.y,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T25.X, PS, literal.x,
-; EG-NEXT: BFE_INT T23.Y, T0.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T24.Z, T0.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.W, T12.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T22.Y, T1.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T23.Z, T1.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T0.W, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T26.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T24.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT * T23.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
@@ -3902,181 +3976,225 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v64i8_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @22
-; EG-NEXT: ALU 59, @31, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @26
-; EG-NEXT: ALU 88, @91, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @28
+; EG-NEXT: ALU 3, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @30
+; EG-NEXT: ALU 3, @41, KC0[], KC1[]
+; EG-NEXT: TEX 0 @32
+; EG-NEXT: ALU 3, @45, KC0[], KC1[]
+; EG-NEXT: TEX 0 @34
+; EG-NEXT: ALU 95, @49, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 73, @145, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T35.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T31.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1
-; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1
-; EG-NEXT: Fetch clause starting at 26:
-; EG-NEXT: VTX_READ_128 T32.XYZW, T21.X, 48, #1
-; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1
-; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: MOV * T21.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 31:
-; EG-NEXT: MOV * T0.W, literal.x,
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 28:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; EG-NEXT: Fetch clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 32:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 34:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 36:
+; EG-NEXT: MOV * T19.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T17.X, T20.X,
+; EG-NEXT: MOV * T16.X, T20.Y,
+; EG-NEXT: MOV T15.X, T20.Z,
+; EG-NEXT: MOV * T14.X, T20.W,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: MOV T13.X, T20.X,
+; EG-NEXT: MOV * T12.X, T20.Y,
+; EG-NEXT: MOV T11.X, T20.Z,
+; EG-NEXT: MOV * T10.X, T20.W,
+; EG-NEXT: ALU clause starting at 45:
+; EG-NEXT: MOV T9.X, T20.X,
+; EG-NEXT: MOV * T8.X, T20.Y,
+; EG-NEXT: MOV T7.X, T20.Z,
+; EG-NEXT: MOV * T6.X, T20.W,
+; EG-NEXT: ALU clause starting at 49:
+; EG-NEXT: MOV T5.X, T19.X,
+; EG-NEXT: MOV * T4.X, T19.Y,
+; EG-NEXT: MOV T3.X, T19.Z,
+; EG-NEXT: MOV * T2.X, T19.W,
+; EG-NEXT: MOV T0.Y, T11.X,
+; EG-NEXT: MOV T0.Z, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T13.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T14.X,
+; EG-NEXT: MOV T1.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T16.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T17.X,
+; EG-NEXT: MOV * T2.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T19.Z, T23.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT * T19.Z, PV.Y, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.Y, T23.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T20.Z, T23.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T19.W, T23.X, literal.z,
+; EG-NEXT: BFE_UINT T19.Y, T2.Y, literal.x, T2.W,
+; EG-NEXT: BFE_UINT T20.Z, T1.W, literal.y, T2.W,
+; EG-NEXT: LSHR * T19.W, T2.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T19.X, T23.X, literal.x,
-; EG-NEXT: BFE_UINT T20.Y, T23.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T19.X, T2.Y, literal.x,
+; EG-NEXT: BFE_UINT T20.Y, T1.W, literal.y, T2.W,
+; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T24.Z, T23.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T20.W, T23.Y, literal.y,
+; EG-NEXT: BFE_UINT T22.Z, T1.Z, literal.x, T2.W,
+; EG-NEXT: LSHR * T20.W, T1.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T20.X, T23.Y, literal.x,
-; EG-NEXT: BFE_UINT T24.Y, T23.Z, literal.y, T0.W,
+; EG-NEXT: AND_INT T20.X, T1.W, literal.x,
+; EG-NEXT: BFE_UINT T22.Y, T1.Z, literal.y, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T25.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T26.Z, T23.W, literal.y, T0.W,
-; EG-NEXT: LSHR T24.W, T23.Z, literal.z,
-; EG-NEXT: AND_INT * T24.X, T23.Z, literal.w,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T24.Z, T1.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T22.W, T1.Z, literal.z,
+; EG-NEXT: AND_INT * T22.X, T1.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T26.Y, T23.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T24.Y, T1.Y, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T26.W, T23.W, literal.z,
-; EG-NEXT: AND_INT * T26.X, T23.W, literal.w,
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T26.Z, T0.W, literal.y, T2.W,
+; EG-NEXT: LSHR T24.W, T1.Y, literal.z,
+; EG-NEXT: AND_INT * T24.X, T1.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T28.Y, T22.X, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T26.Y, T0.W, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T30.Z, T22.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T28.W, T22.X, literal.z,
-; EG-NEXT: AND_INT * T28.X, T22.X, literal.w,
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T28.Z, T0.Z, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T26.W, T0.W, literal.z,
+; EG-NEXT: AND_INT * T26.X, T0.W, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T30.Y, T22.Y, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT T28.Y, T0.Z, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT: LSHR T22.X, PV.W, literal.x,
-; EG-NEXT: LSHR T30.W, T22.Y, literal.y,
-; EG-NEXT: AND_INT * T30.X, T22.Y, literal.z,
+; EG-NEXT: LSHR T29.X, PV.W, literal.x,
+; EG-NEXT: LSHR T28.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T28.X, T0.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T21.Z, T22.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT T30.Z, T0.Y, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
; EG-NEXT: LSHR T31.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT * T21.Y, T22.Z, literal.y, T0.W,
+; EG-NEXT: BFE_UINT T30.Y, T0.Y, literal.y, T2.W,
+; EG-NEXT: MOV T0.Z, T2.X,
+; EG-NEXT: MOV T0.W, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.Y, T4.X,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT: ALU clause starting at 91:
-; EG-NEXT: BFE_UINT T34.Z, T22.W, literal.x, T0.W,
-; EG-NEXT: LSHR * T21.W, T22.Z, literal.y,
+; EG-NEXT: MOV T1.Z, T5.X,
+; EG-NEXT: MOV * T1.W, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T7.X,
+; EG-NEXT: MOV T2.Z, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.Y, T9.X,
+; EG-NEXT: MOV * T3.Z, T10.X,
+; EG-NEXT: BFE_UINT T32.Z, PV.Z, literal.x, T2.W,
+; EG-NEXT: LSHR * T30.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T21.X, T22.Z, literal.x,
-; EG-NEXT: BFE_UINT T34.Y, T22.W, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T30.X, T0.Y, literal.x,
+; EG-NEXT: BFE_UINT T32.Y, T3.Z, literal.y, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T35.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T34.W, T22.W, literal.z,
-; EG-NEXT: AND_INT * T34.X, T22.W, literal.w,
+; EG-NEXT: LSHR T33.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T34.Z, T3.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T32.W, T3.Z, literal.z,
+; EG-NEXT: AND_INT * T32.X, T3.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T36.Y, T33.X, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 112(1.569454e-43)
-; EG-NEXT: LSHR T37.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T38.Z, T33.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T36.W, T33.X, literal.z,
-; EG-NEXT: AND_INT * T36.X, T33.X, literal.w,
+; EG-NEXT: BFE_UINT * T34.Y, T3.Y, literal.x, T2.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 145:
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T35.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T36.Z, T2.Z, literal.y, T2.W,
+; EG-NEXT: LSHR T34.W, T3.Y, literal.z,
+; EG-NEXT: AND_INT * T34.X, T3.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T38.Y, T33.Y, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT T36.Y, T2.Z, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 128(1.793662e-43)
-; EG-NEXT: LSHR T33.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T39.Z, T33.Z, literal.y, T0.W,
-; EG-NEXT: LSHR T38.W, T33.Y, literal.z,
-; EG-NEXT: AND_INT * T38.X, T33.Y, literal.w,
+; EG-NEXT: LSHR T37.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T38.Z, T2.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T36.W, T2.Z, literal.z,
+; EG-NEXT: AND_INT * T36.X, T2.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T39.Y, T33.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT T38.Y, T2.Y, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 144(2.017870e-43)
-; EG-NEXT: LSHR T40.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T41.Z, T33.W, literal.y, T0.W,
-; EG-NEXT: LSHR T39.W, T33.Z, literal.z,
-; EG-NEXT: AND_INT * T39.X, T33.Z, literal.w,
+; EG-NEXT: LSHR T39.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T40.Z, T1.W, literal.y, T2.W,
+; EG-NEXT: LSHR T38.W, T2.Y, literal.z,
+; EG-NEXT: AND_INT * T38.X, T2.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T41.Y, T33.W, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT T40.Y, T1.W, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 160(2.242078e-43)
-; EG-NEXT: LSHR T42.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T43.Z, T32.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T41.W, T33.W, literal.z,
-; EG-NEXT: AND_INT * T41.X, T33.W, literal.w,
+; EG-NEXT: LSHR T41.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T42.Z, T1.Z, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T40.W, T1.W, literal.z,
+; EG-NEXT: AND_INT * T40.X, T1.W, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T43.Y, T32.X, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T42.Y, T1.Z, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 176(2.466285e-43)
-; EG-NEXT: LSHR T44.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T45.Z, T32.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T43.W, T32.X, literal.z,
-; EG-NEXT: AND_INT * T43.X, T32.X, literal.w,
+; EG-NEXT: LSHR T43.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T44.Z, T1.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T42.W, T1.Z, literal.z,
+; EG-NEXT: AND_INT * T42.X, T1.Z, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T45.Y, T32.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T44.Y, T1.Y, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 192(2.690493e-43)
-; EG-NEXT: LSHR T32.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T46.Z, T32.Z, literal.y, T0.W,
-; EG-NEXT: LSHR T45.W, T32.Y, literal.z,
-; EG-NEXT: AND_INT * T45.X, T32.Y, literal.w,
+; EG-NEXT: LSHR T45.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T46.Z, T0.W, literal.y, T2.W,
+; EG-NEXT: LSHR T44.W, T1.Y, literal.z,
+; EG-NEXT: AND_INT * T44.X, T1.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T46.Y, T32.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T46.Y, T0.W, literal.x, T2.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43)
; EG-NEXT: LSHR T47.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T48.Z, T32.W, literal.y, T0.W,
-; EG-NEXT: LSHR T46.W, T32.Z, literal.z,
-; EG-NEXT: AND_INT * T46.X, T32.Z, literal.w,
+; EG-NEXT: BFE_UINT T48.Z, T0.Z, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T46.W, T0.W, literal.z,
+; EG-NEXT: AND_INT * T46.X, T0.W, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T48.Y, T32.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T48.Y, T0.Z, literal.x, T2.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43)
; EG-NEXT: LSHR T49.X, PV.W, literal.x,
-; EG-NEXT: LSHR T48.W, T32.W, literal.y,
-; EG-NEXT: AND_INT * T48.X, T32.W, literal.z,
+; EG-NEXT: LSHR T48.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T48.X, T0.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -4822,231 +4940,271 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v64i8_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @32, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 40, @33, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @28
-; EG-NEXT: ALU 76, @74, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 72, @151, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T35.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0
+; EG-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @28
+; EG-NEXT: ALU 3, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @30
+; EG-NEXT: ALU 3, @41, KC0[], KC1[]
+; EG-NEXT: TEX 0 @32
+; EG-NEXT: ALU 3, @45, KC0[], KC1[]
+; EG-NEXT: TEX 0 @34
+; EG-NEXT: ALU 90, @49, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 71, @140, KC0[], KC1[]
+; EG-NEXT: ALU 46, @212, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T50.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T48.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T31.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T30.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T22.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T20.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T19.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T21.X, 32, #1
-; EG-NEXT: VTX_READ_128 T19.XYZW, T21.X, 48, #1
; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_128 T31.XYZW, T21.X, 0, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T21.X, 16, #1
-; EG-NEXT: ALU clause starting at 32:
-; EG-NEXT: MOV * T21.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 33:
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; EG-NEXT: Fetch clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 32:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 34:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 36:
+; EG-NEXT: MOV * T19.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T17.X, T20.X,
+; EG-NEXT: MOV * T16.X, T20.Y,
+; EG-NEXT: MOV T15.X, T20.Z,
+; EG-NEXT: MOV * T14.X, T20.W,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: MOV T13.X, T20.X,
+; EG-NEXT: MOV * T12.X, T20.Y,
+; EG-NEXT: MOV T11.X, T20.Z,
+; EG-NEXT: MOV * T10.X, T20.W,
+; EG-NEXT: ALU clause starting at 45:
+; EG-NEXT: MOV T9.X, T20.X,
+; EG-NEXT: MOV * T8.X, T20.Y,
+; EG-NEXT: MOV T7.X, T20.Z,
+; EG-NEXT: MOV * T6.X, T20.W,
+; EG-NEXT: ALU clause starting at 49:
+; EG-NEXT: MOV T5.X, T19.X,
+; EG-NEXT: MOV * T4.X, T19.Y,
+; EG-NEXT: MOV T3.X, T19.Z,
+; EG-NEXT: MOV * T2.X, T19.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T0.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.Y, PS,
+; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T23.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T20.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T24.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T21.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T25.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T22.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T26.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T24.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: LSHR T28.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.Y, T19.W, literal.y,
-; EG-NEXT: LSHR T0.Z, T19.Z, literal.z,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.w,
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: LSHR T2.Y, T1.Y, literal.y,
+; EG-NEXT: LSHR T1.Z, T0.W, literal.z,
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: LSHR T1.Y, T19.Z, literal.y,
-; EG-NEXT: LSHR T1.Z, T19.Y, literal.z,
-; EG-NEXT: LSHR * T1.W, T19.Z, literal.w,
+; EG-NEXT: LSHR T26.X, PV.W, literal.x,
+; EG-NEXT: LSHR T3.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T2.Z, T0.Z, literal.z,
+; EG-NEXT: LSHR * T2.W, T0.W, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: LSHR T2.Y, T19.Y, literal.y,
-; EG-NEXT: LSHR T2.Z, T19.Y, literal.z,
-; EG-NEXT: LSHR T2.W, T19.X, literal.y,
-; EG-NEXT: LSHR * T3.W, T19.X, literal.z,
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: LSHR T4.Y, T0.Z, literal.y,
+; EG-NEXT: LSHR T3.Z, T0.Z, literal.z,
+; EG-NEXT: LSHR T3.W, T0.Y, literal.y,
+; EG-NEXT: LSHR * T4.W, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 74:
-; EG-NEXT: LSHR T3.Y, T20.W, literal.x,
-; EG-NEXT: LSHR T3.Z, T20.W, literal.y,
-; EG-NEXT: LSHR T4.W, T20.Z, literal.x,
-; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.z,
+; EG-NEXT: MOV * T4.Z, T16.X,
+; EG-NEXT: MOV T5.Y, T15.X,
+; EG-NEXT: MOV T5.Z, T14.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T5.W, T17.X, BS:VEC_201
+; EG-NEXT: MOV T6.Y, T13.X,
+; EG-NEXT: MOV T6.Z, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T6.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T7.Y, T10.X,
+; EG-NEXT: MOV T7.Z, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T7.W, T8.X, BS:VEC_201
+; EG-NEXT: MOV T8.Y, T7.X,
+; EG-NEXT: MOV * T8.Z, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T9.Y, PV.Z, literal.x,
+; EG-NEXT: LSHR T9.Z, PV.Z, literal.y,
+; EG-NEXT: LSHR T8.W, PV.Y, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T32.X, PS, literal.x,
-; EG-NEXT: LSHR T4.Y, T20.Z, literal.y,
-; EG-NEXT: LSHR T4.Z, T20.Y, literal.z,
-; EG-NEXT: LSHR T5.W, T20.Y, literal.y,
-; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T28.X, PS, literal.x,
+; EG-NEXT: LSHR T10.Y, T8.Y, literal.y,
+; EG-NEXT: LSHR T10.Z, T7.W, literal.z,
+; EG-NEXT: LSHR T9.W, T7.W, literal.y,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 160(2.242078e-43)
-; EG-NEXT: LSHR T33.X, PS, literal.x,
-; EG-NEXT: LSHR T5.Y, T20.X, literal.y,
-; EG-NEXT: LSHR T5.Z, T20.X, literal.z,
-; EG-NEXT: LSHR T6.W, T21.W, literal.y,
-; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T29.X, PS, literal.x,
+; EG-NEXT: LSHR T11.Y, T7.Z, literal.y,
+; EG-NEXT: LSHR T11.Z, T7.Z, literal.z,
+; EG-NEXT: LSHR T10.W, T7.Y, literal.y,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 176(2.466285e-43)
-; EG-NEXT: LSHR T34.X, PS, literal.x,
-; EG-NEXT: LSHR T6.Y, T21.W, literal.y,
-; EG-NEXT: LSHR T6.Z, T21.Z, literal.z,
-; EG-NEXT: LSHR T7.W, T21.Z, literal.y,
-; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T30.X, PS, literal.x,
+; EG-NEXT: LSHR T12.Y, T7.Y, literal.y,
+; EG-NEXT: LSHR T12.Z, T6.W, literal.z,
+; EG-NEXT: LSHR T11.W, T6.W, literal.y,
+; EG-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43)
-; EG-NEXT: LSHR T35.X, PS, literal.x,
-; EG-NEXT: LSHR T7.Y, T21.Y, literal.y,
-; EG-NEXT: LSHR T7.Z, T21.Y, literal.z,
-; EG-NEXT: LSHR T8.W, T21.X, literal.y,
-; EG-NEXT: LSHR * T9.W, T21.X, literal.z,
+; EG-NEXT: LSHR T31.X, PS, literal.x,
+; EG-NEXT: LSHR * T13.Y, T6.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T36.X, T31.X, 0.0, literal.x,
-; EG-NEXT: LSHR T8.Y, T31.W, literal.y,
-; EG-NEXT: LSHR T8.Z, T31.W, literal.z,
-; EG-NEXT: LSHR T10.W, T31.Z, literal.y,
-; EG-NEXT: LSHR * T11.W, T31.X, literal.z,
+; EG-NEXT: ALU clause starting at 140:
+; EG-NEXT: LSHR T13.Z, T6.Z, literal.x,
+; EG-NEXT: LSHR T12.W, T6.Y, literal.y,
+; EG-NEXT: LSHR * T13.W, T6.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
+; EG-NEXT: BFE_INT T32.X, T5.W, 0.0, literal.x,
+; EG-NEXT: LSHR T14.Y, T5.Z, literal.y,
+; EG-NEXT: LSHR T14.Z, T5.Z, literal.z,
+; EG-NEXT: LSHR T14.W, T5.Y, literal.y,
+; EG-NEXT: LSHR * T15.W, T5.W, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T37.X, T31.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T9.Y, T31.Z, literal.y,
-; EG-NEXT: LSHR T9.Z, T31.Y, literal.y,
-; EG-NEXT: BFE_INT T36.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T11.W, T31.X, literal.z,
+; EG-NEXT: BFE_INT T33.X, T4.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T15.Y, T5.Y, literal.y,
+; EG-NEXT: LSHR T15.Z, T4.Z, literal.y,
+; EG-NEXT: BFE_INT T32.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T15.W, T5.W, literal.z,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T38.X, T31.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T10.Y, T31.Y, literal.y,
-; EG-NEXT: BFE_INT T36.Z, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T37.W, PV.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T11.W, T31.X, literal.x,
+; EG-NEXT: BFE_INT T34.X, T5.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T16.Y, T4.Z, literal.y,
+; EG-NEXT: BFE_INT T32.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T33.W, PV.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T5.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T39.X, T31.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T35.X, T5.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T32.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T33.Z, PV.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T34.W, T15.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T4.Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T36.X, T6.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T33.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T34.Z, T14.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T35.W, T14.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T5.Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T37.X, T6.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T34.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T35.Z, T14.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T36.W, T13.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T5.Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T38.X, T6.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T35.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T36.Z, T12.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T37.W, T13.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T6.Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T39.X, T7.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T36.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T37.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T38.W, T9.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T11.W, T31.Y, literal.x,
+; EG-NEXT: BFE_INT T37.Z, T13.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T38.W, T11.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T6.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T40.X, T21.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T40.X, T7.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T37.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T38.Z, T10.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T39.W, T8.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T10.W, T31.Z, literal.x,
+; EG-NEXT: BFE_INT T38.Z, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T39.W, T12.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T6.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T31.X, T21.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T41.X, T7.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T38.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T39.Z, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT T40.W, T9.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T9.W, T31.W, literal.x,
+; EG-NEXT: BFE_INT T39.Z, T10.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T40.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T7.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T41.X, T21.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T42.X, T8.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T39.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T40.Z, T8.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T31.W, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 151:
-; EG-NEXT: LSHR * T8.W, T21.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T42.X, T21.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T40.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.Z, T7.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T41.W, T7.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T7.W, T21.Y, literal.x,
+; EG-NEXT: BFE_INT T40.Z, T11.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T41.W, T9.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T7.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T43.X, T20.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T41.Z, T6.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.W, T6.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T7.W, T21.Z, literal.x,
+; EG-NEXT: ALU clause starting at 212:
+; EG-NEXT: BFE_INT T43.X, T8.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T40.Y, T5.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T41.Z, T10.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T42.W, T10.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T7.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T44.X, T0.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T41.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Z, T6.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.W, T5.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T21.W, literal.x,
+; EG-NEXT: BFE_INT T42.Z, T8.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T43.W, T9.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T5.W, T8.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T44.X, T20.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T45.X, T0.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Z, T5.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.W, T5.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T20.X, literal.x,
+; EG-NEXT: BFE_INT T43.Z, T9.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T44.W, T4.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T4.W, T8.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T45.X, T20.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T46.X, T0.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Z, T4.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T20.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T46.X, T19.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Z, T4.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T44.Z, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BFE_INT T45.W, T3.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T20.Z, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T47.X, T1.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Z, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT T46.W, T3.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T20.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T47.X, T19.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Z, T2.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T2.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T48.X, T19.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.Z, T2.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T47.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_INT T45.Z, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T46.W, T2.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43)
-; EG-NEXT: LSHR T19.X, PS, literal.x,
-; EG-NEXT: BFE_INT T20.Y, T1.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T47.Z, T1.Y, 0.0, literal.y,
-; EG-NEXT: BFE_INT T48.W, T0.W, 0.0, literal.y,
+; EG-NEXT: LSHR T48.X, PS, literal.x,
+; EG-NEXT: BFE_INT T45.Y, T2.Z, 0.0, literal.y,
+; EG-NEXT: BFE_INT T46.Z, T3.Y, 0.0, literal.y,
+; EG-NEXT: BFE_INT T47.W, T1.W, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T49.X, PS, literal.x,
-; EG-NEXT: BFE_INT T47.Y, T0.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T48.Z, T0.Y, 0.0, literal.y,
-; EG-NEXT: LSHR T0.W, T19.W, literal.y,
+; EG-NEXT: BFE_INT T46.Y, T1.Z, 0.0, literal.y,
+; EG-NEXT: BFE_INT T47.Z, T2.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T1.Y, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T50.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T48.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT * T47.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v64i8_to_v64i32:
@@ -5840,27 +5998,29 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 1
+; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T5.X, T4.X, literal.x, PV.W,
-; EG-NEXT: LSHR * T5.Z, T4.X, literal.y,
+; EG-NEXT: BFE_UINT T4.X, PV.Y, literal.x, PV.W,
+; EG-NEXT: LSHR * T4.Z, PV.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
; EG-NEXT: MOV T4.Y, 0.0,
-; EG-NEXT: MOV T5.W, 0.0,
-; EG-NEXT: MOV * T4.W, 0.0,
+; EG-NEXT: BFE_UINT * T5.Z, T0.Y, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T5.X, T0.Y, literal.x,
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: MOV T4.W, 0.0,
+; EG-NEXT: MOV * T5.W, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
@@ -6178,41 +6338,45 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 34, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T9.X, 1
+; EG-NEXT: ALU 38, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: MOV T3.X, T5.X,
+; EG-NEXT: MOV * T2.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T6.X, T5.Y, literal.x, PV.W,
-; EG-NEXT: LSHR * T6.Z, T5.Y, literal.y,
+; EG-NEXT: BFE_UINT T5.X, PV.Z, literal.x, PV.W,
+; EG-NEXT: LSHR * T5.Z, PV.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T6.Y, 0.0,
-; EG-NEXT: BFE_UINT * T7.Z, T5.Y, literal.x, T0.W,
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: BFE_UINT * T6.Z, T0.Z, literal.x, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T7.X, T5.Y, literal.x,
-; EG-NEXT: MOV * T7.Y, 0.0,
+; EG-NEXT: AND_INT T6.X, T0.Z, literal.x,
+; EG-NEXT: MOV * T6.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.X, T5.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T8.Z, T5.X, literal.y,
+; EG-NEXT: BFE_UINT T7.X, T0.Y, literal.x, T0.W,
+; EG-NEXT: LSHR * T7.Z, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: BFE_UINT * T5.Z, T5.X, literal.x, T0.W,
+; EG-NEXT: MOV T7.Y, 0.0,
+; EG-NEXT: BFE_UINT * T8.Z, T0.Y, literal.x, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T6.W, 0.0,
-; EG-NEXT: MOV * T7.W, 0.0,
+; EG-NEXT: AND_INT T8.X, T0.Y, literal.x,
+; EG-NEXT: MOV T8.Y, 0.0,
+; EG-NEXT: MOV T5.W, 0.0,
+; EG-NEXT: MOV * T6.W, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T8.W, 0.0,
-; EG-NEXT: MOV * T5.W, 0.0,
+; EG-NEXT: MOV T7.W, 0.0,
+; EG-NEXT: MOV * T8.W, 0.0,
; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -6746,68 +6910,76 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 68, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T15.X, 1
+; EG-NEXT: ALU 76, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T20.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T15.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: MOV * T0.W, literal.x,
+; EG-NEXT: MOV T5.X, T7.X,
+; EG-NEXT: MOV * T4.X, T7.Y,
+; EG-NEXT: MOV T3.X, T7.Z,
+; EG-NEXT: MOV * T2.X, T7.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T0.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.Y, PS,
+; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.X, T7.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T8.Z, T7.W, literal.y,
+; EG-NEXT: BFE_UINT T7.X, T1.Y, literal.x, PV.W,
+; EG-NEXT: LSHR * T7.Z, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: BFE_UINT * T9.Z, T7.W, literal.x, T0.W,
+; EG-NEXT: MOV T7.Y, 0.0,
+; EG-NEXT: BFE_UINT * T8.Z, T1.Y, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T9.X, T7.W, literal.x,
-; EG-NEXT: MOV * T9.Y, 0.0,
+; EG-NEXT: AND_INT T8.X, T1.Y, literal.x,
+; EG-NEXT: MOV * T8.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T10.X, T7.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T10.Z, T7.Z, literal.y,
+; EG-NEXT: BFE_UINT T9.X, T0.W, literal.x, T1.W,
+; EG-NEXT: LSHR * T9.Z, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T10.Y, 0.0,
-; EG-NEXT: BFE_UINT * T11.Z, T7.Z, literal.x, T0.W,
+; EG-NEXT: MOV T9.Y, 0.0,
+; EG-NEXT: BFE_UINT * T10.Z, T0.W, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T11.X, T7.Z, literal.x,
-; EG-NEXT: MOV * T11.Y, 0.0,
+; EG-NEXT: AND_INT T10.X, T0.W, literal.x,
+; EG-NEXT: MOV * T10.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T12.X, T7.Y, literal.x, T0.W,
-; EG-NEXT: LSHR * T12.Z, T7.Y, literal.y,
+; EG-NEXT: BFE_UINT T11.X, T0.Z, literal.x, T1.W,
+; EG-NEXT: LSHR * T11.Z, T0.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T12.Y, 0.0,
-; EG-NEXT: BFE_UINT * T13.Z, T7.Y, literal.x, T0.W,
+; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: BFE_UINT * T12.Z, T0.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T7.Y, literal.x,
-; EG-NEXT: MOV * T13.Y, 0.0,
+; EG-NEXT: AND_INT T12.X, T0.Z, literal.x,
+; EG-NEXT: MOV * T12.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T14.X, T7.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T14.Z, T7.X, literal.y,
+; EG-NEXT: BFE_UINT T13.X, T0.Y, literal.x, T1.W,
+; EG-NEXT: LSHR * T13.Z, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T14.Y, 0.0,
-; EG-NEXT: BFE_UINT * T7.Z, T7.X, literal.x, T0.W,
+; EG-NEXT: MOV T13.Y, 0.0,
+; EG-NEXT: BFE_UINT * T14.Z, T0.Y, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T7.X, T7.X, literal.x,
-; EG-NEXT: MOV T7.Y, 0.0,
-; EG-NEXT: MOV T8.W, 0.0,
-; EG-NEXT: MOV * T9.W, 0.0,
+; EG-NEXT: AND_INT T14.X, T0.Y, literal.x,
+; EG-NEXT: MOV T14.Y, 0.0,
+; EG-NEXT: MOV T7.W, 0.0,
+; EG-NEXT: MOV * T8.W, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T10.W, 0.0,
-; EG-NEXT: MOV * T11.W, 0.0,
-; EG-NEXT: MOV T12.W, 0.0,
-; EG-NEXT: MOV * T13.W, 0.0,
-; EG-NEXT: MOV T14.W, 0.0,
-; EG-NEXT: MOV * T7.W, 0.0,
-; EG-NEXT: LSHR T15.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T9.W, 0.0,
+; EG-NEXT: MOV * T10.W, 0.0,
+; EG-NEXT: MOV T11.W, 0.0,
+; EG-NEXT: MOV * T12.W, 0.0,
+; EG-NEXT: MOV T13.W, 0.0,
+; EG-NEXT: MOV * T14.W, 0.0,
+; EG-NEXT: LSHR T15.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
@@ -7771,141 +7943,162 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v32i8_to_v32i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @22
-; EG-NEXT: ALU 103, @27, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 33, @131, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T35.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 1
+; EG-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @24
+; EG-NEXT: ALU 3, @29, KC0[], KC1[]
+; EG-NEXT: TEX 0 @26
+; EG-NEXT: ALU 107, @33, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 42, @141, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T42.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T40.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T38.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T36.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T35.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T30.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T28.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T27.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 22:
+; EG-NEXT: Fetch clause starting at 24:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 26:
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
-; EG-NEXT: ALU clause starting at 26:
+; EG-NEXT: ALU clause starting at 28:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 27:
-; EG-NEXT: MOV * T0.W, literal.x,
+; EG-NEXT: ALU clause starting at 29:
+; EG-NEXT: MOV T9.X, T12.X,
+; EG-NEXT: MOV * T8.X, T12.Y,
+; EG-NEXT: MOV T7.X, T12.Z,
+; EG-NEXT: MOV * T6.X, T12.W,
+; EG-NEXT: ALU clause starting at 33:
+; EG-NEXT: MOV T5.X, T11.X,
+; EG-NEXT: MOV * T4.X, T11.Y,
+; EG-NEXT: MOV T3.X, T11.Z,
+; EG-NEXT: MOV * T2.X, T11.W,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: MOV T0.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, T4.X, BS:VEC_201
+; EG-NEXT: MOV * T1.Y, PV.X,
+; EG-NEXT: MOV T1.Z, T2.X,
+; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T11.X, PV.Z, literal.x, PV.W,
+; EG-NEXT: LSHR * T11.Z, PV.Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: BFE_UINT * T12.Z, T1.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T13.X, T11.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T13.Z, T11.W, literal.y,
+; EG-NEXT: AND_INT T12.X, T1.Z, literal.x,
+; EG-NEXT: MOV * T12.Y, 0.0,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T13.X, T1.Y, literal.x, T1.W,
+; EG-NEXT: LSHR * T13.Z, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: MOV T13.Y, 0.0,
-; EG-NEXT: BFE_UINT * T14.Z, T11.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T14.Z, T1.Y, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T14.X, T11.W, literal.x,
+; EG-NEXT: AND_INT T14.X, T1.Y, literal.x,
; EG-NEXT: MOV * T14.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T15.X, T11.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T15.Z, T11.Z, literal.y,
+; EG-NEXT: BFE_UINT T15.X, T0.W, literal.x, T1.W,
+; EG-NEXT: LSHR * T15.Z, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: MOV T15.Y, 0.0,
-; EG-NEXT: BFE_UINT * T16.Z, T11.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T16.Z, T0.W, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T16.X, T11.Z, literal.x,
+; EG-NEXT: AND_INT T16.X, T0.W, literal.x,
; EG-NEXT: MOV * T16.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T17.X, T11.Y, literal.x, T0.W,
-; EG-NEXT: LSHR * T17.Z, T11.Y, literal.y,
+; EG-NEXT: BFE_UINT T17.X, T0.Z, literal.x, T1.W,
+; EG-NEXT: LSHR * T17.Z, T0.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: MOV T17.Y, 0.0,
-; EG-NEXT: BFE_UINT * T18.Z, T11.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T18.Z, T0.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T18.X, T11.Y, literal.x,
+; EG-NEXT: AND_INT T18.X, T0.Z, literal.x,
; EG-NEXT: MOV * T18.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.X, T11.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T19.Z, T11.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T19.Y, 0.0,
-; EG-NEXT: BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T19.X, T0.Y, literal.x, T1.W,
+; EG-NEXT: MOV T0.Z, T9.X,
+; EG-NEXT: MOV * T0.W, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: MOV T1.Y, T7.X,
+; EG-NEXT: LSHR T19.Z, T0.Y, literal.x,
+; EG-NEXT: MOV * T19.Y, 0.0,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT * T20.Z, T0.Y, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T11.X, T11.X, literal.x,
-; EG-NEXT: MOV * T11.Y, 0.0,
+; EG-NEXT: AND_INT T20.X, T0.Y, literal.x,
+; EG-NEXT: MOV * T20.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T20.X, T12.W, literal.x, T0.W,
-; EG-NEXT: LSHR * T20.Z, T12.W, literal.y,
+; EG-NEXT: BFE_UINT T21.X, T1.Y, literal.x, T1.W,
+; EG-NEXT: LSHR * T21.Z, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T20.Y, 0.0,
-; EG-NEXT: BFE_UINT * T21.Z, T12.W, literal.x, T0.W,
+; EG-NEXT: MOV T21.Y, 0.0,
+; EG-NEXT: BFE_UINT * T22.Z, T1.Y, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T21.X, T12.W, literal.x,
-; EG-NEXT: MOV * T21.Y, 0.0,
+; EG-NEXT: AND_INT T22.X, T1.Y, literal.x,
+; EG-NEXT: MOV * T22.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T22.X, T12.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T22.Z, T12.Z, literal.y,
+; EG-NEXT: BFE_UINT T23.X, T0.W, literal.x, T1.W,
+; EG-NEXT: LSHR * T23.Z, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T22.Y, 0.0,
-; EG-NEXT: BFE_UINT * T23.Z, T12.Z, literal.x, T0.W,
+; EG-NEXT: MOV T23.Y, 0.0,
+; EG-NEXT: BFE_UINT * T24.Z, T0.W, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T23.X, T12.Z, literal.x,
-; EG-NEXT: MOV * T23.Y, 0.0,
+; EG-NEXT: AND_INT T24.X, T0.W, literal.x,
+; EG-NEXT: MOV * T24.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T24.X, T12.Y, literal.x, T0.W,
-; EG-NEXT: LSHR * T24.Z, T12.Y, literal.y,
+; EG-NEXT: BFE_UINT T25.X, T0.Z, literal.x, T1.W,
+; EG-NEXT: LSHR * T25.Z, T0.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T24.Y, 0.0,
-; EG-NEXT: BFE_UINT * T25.Z, T12.Y, literal.x, T0.W,
+; EG-NEXT: MOV T25.Y, 0.0,
+; EG-NEXT: BFE_UINT * T26.Z, T0.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T25.X, T12.Y, literal.x,
-; EG-NEXT: MOV * T25.Y, 0.0,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T26.X, T12.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T26.Z, T12.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T26.X, T0.Z, literal.x,
; EG-NEXT: MOV T26.Y, 0.0,
-; EG-NEXT: BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T12.X, T12.X, literal.x,
-; EG-NEXT: MOV T12.Y, 0.0,
+; EG-NEXT: MOV T11.W, 0.0,
+; EG-NEXT: MOV * T12.W, 0.0,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T13.W, 0.0,
; EG-NEXT: MOV * T14.W, 0.0,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T15.W, 0.0,
; EG-NEXT: MOV * T16.W, 0.0,
; EG-NEXT: MOV T17.W, 0.0,
; EG-NEXT: MOV * T18.W, 0.0,
; EG-NEXT: MOV T19.W, 0.0,
-; EG-NEXT: MOV * T11.W, 0.0,
-; EG-NEXT: MOV T20.W, 0.0,
-; EG-NEXT: MOV * T21.W, 0.0,
-; EG-NEXT: MOV T22.W, 0.0,
-; EG-NEXT: MOV * T23.W, 0.0,
-; EG-NEXT: MOV T24.W, 0.0,
-; EG-NEXT: MOV * T25.W, 0.0,
-; EG-NEXT: MOV T26.W, 0.0,
-; EG-NEXT: MOV * T12.W, 0.0,
+; EG-NEXT: MOV * T20.W, 0.0,
+; EG-NEXT: MOV T21.W, 0.0,
+; EG-NEXT: MOV * T22.W, 0.0,
+; EG-NEXT: MOV T23.W, 0.0,
+; EG-NEXT: MOV * T24.W, 0.0,
+; EG-NEXT: MOV T25.W, 0.0,
+; EG-NEXT: MOV * T26.W, 0.0,
; EG-NEXT: LSHR T27.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T28.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: LSHR * T28.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 141:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 131:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T31.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
@@ -9300,9 +9493,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX6-NOHSA-NEXT: s_mov_b32 s4, s0
; GFX6-NOHSA-NEXT: s_mov_b32 s5, s1
; GFX6-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX6-NOHSA-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX6-NOHSA-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX6-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NOHSA-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v0, 0xff00ff, v0
; GFX6-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
@@ -9316,9 +9510,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: s_waitcnt vmcnt(0)
-; GFX7-HSA-NEXT: v_lshlrev_b32_e32 v3, 8, v2
+; GFX7-HSA-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; GFX7-HSA-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-HSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-HSA-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-HSA-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
; GFX7-HSA-NEXT: flat_store_dword v[0:1], v2
; GFX7-HSA-NEXT: s_endpgm
;
@@ -9342,22 +9537,31 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v2i8_to_v2i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T5.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T5.X, literal.x,
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: AND_INT T0.W, T5.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), -65536(nan)
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: MOV * T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
-; EG-NEXT: 16711680(2.341805e-38), 255(3.573311e-43)
+; EG-NEXT: BFE_UINT * T1.W, PV.Y, literal.x, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.y,
+; EG-NEXT: 16(2.242078e-44), 255(3.573311e-43)
; EG-NEXT: OR_INT T5.X, PS, PV.W,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -9506,12 +9710,12 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s4, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 16
; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
@@ -9526,11 +9730,11 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s0, s2, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16
; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0
; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
@@ -9559,45 +9763,48 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v4i8_to_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 35, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: MOV * T7.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T7.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV * T2.X, T7.X,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T7.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: MOV * T0.Z, T4.X,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV * T4.X, PV.W,
+; EG-NEXT: MOV T0.Z, PV.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: BFE_UINT T1.W, T0.Y, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Z, T5.X,
+; EG-NEXT: BFE_UINT * T0.W, T0.Y, literal.x, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: MOV T0.Z, PV.X,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Z, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -9706,7 +9913,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 39, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -9726,9 +9933,11 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -9791,18 +10000,18 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s4, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s5, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8
; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6
@@ -9823,16 +10032,16 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s0, s2, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s4, s3, 0x80008
; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 16
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16
; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4
; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0
; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
@@ -9871,80 +10080,84 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v8i8_to_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: ALU 66, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T11.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T11.X, literal.x,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV T2.X, T11.Y,
+; EG-NEXT: MOV * T6.X, T11.X,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV * T0.Z, T8.X,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T0.Z, T2.X,
+; EG-NEXT: MOV * T8.X, T0.W,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: BFE_UINT T2.W, T0.Y, literal.x, PS,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T2.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
+; EG-NEXT: MOV * T0.W, T9.X,
+; EG-NEXT: BFE_UINT T2.W, T0.Y, literal.x, T1.W,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: LSHR * T2.W, T0.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T11.Y, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, T0.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T11.Y, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T0.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T0.Z, literal.x, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T11.W, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: MOV * T11.X, T8.X,
+; EG-NEXT: MOV * T11.Z, T4.X,
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10091,7 +10304,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 78, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -10111,9 +10324,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -10147,9 +10362,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -10220,13 +10437,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s6, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s7, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s4, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s5, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
@@ -10237,13 +10454,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
@@ -10277,22 +10494,22 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s2, s6, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24
-; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s8, s7, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s10, s4, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s12, s5, 0x80008
; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 16
; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 16
; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 16
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 16
; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12
; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10
; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8
@@ -10319,193 +10536,201 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80008
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s7, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s4, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s5, 0x80008
+; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s5, 24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
-; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s5, 0x80010
+; GFX8-NOHSA-NEXT: s_lshl_b32 s11, s11, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s10, s10, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s8, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s7
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s12, s11
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s5, s10
+; GFX8-NOHSA-NEXT: s_or_b32 s4, s8, s4
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s7, s3
; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s2, v3, 16
; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s9, v1, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i8_to_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 103, @12, KC0[], KC1[]
-; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 104, @11, KC0[], KC1[]
+; EG-NEXT: ALU 28, @116, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT T0.W, T19.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T19.X, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: MOV T2.X, T19.W,
+; EG-NEXT: MOV * T6.X, T19.Z,
+; EG-NEXT: MOV T10.X, T19.Y,
+; EG-NEXT: MOV * T14.X, T19.X,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV * T0.Z, T16.X,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T0.Z, T10.X,
+; EG-NEXT: MOV T1.Y, T6.X,
+; EG-NEXT: MOV * T1.Z, T2.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T16.X, T0.W,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: BFE_UINT T2.W, T0.Y, literal.x, PS,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T2.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
+; EG-NEXT: MOV * T0.W, T17.X,
+; EG-NEXT: BFE_UINT T2.W, T0.Y, literal.x, T1.W,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: LSHR * T2.W, T0.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
; EG-NEXT: MOV T17.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, T0.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T12.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Y, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T0.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T12.X, PV.W,
; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T0.Z, literal.x, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T13.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T19.W, PV.W, PS,
; EG-NEXT: MOV T13.X, PV.W,
; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, T1.Y, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T8.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Z, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T1.Y, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T1.Y, literal.x, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T9.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, T1.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.W, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.W, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T1.Z, literal.x, T1.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 116:
+; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T1.Z, literal.x, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 116:
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR T0.W, T19.W, literal.x,
+; EG-NEXT: LSHR T0.W, T1.Z, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
; EG-NEXT: LSHR T21.X, PS, literal.x,
; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
+; EG-NEXT: OR_INT * T20.W, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T19.X, T16.X,
+; EG-NEXT: MOV * T19.Z, T12.X,
+; EG-NEXT: MOV T20.X, T8.X,
+; EG-NEXT: MOV * T20.Z, T4.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -10753,7 +10978,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 104, @12, KC0[], KC1[]
-; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @117, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
; EG-NEXT: CF_END
@@ -10774,9 +10999,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T19.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -10810,9 +11037,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -10846,9 +11075,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -10863,14 +11094,14 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: ALU clause starting at 117:
+; EG-NEXT: MOV T0.Y, T9.X,
; EG-NEXT: ASHR * T0.W, T19.Z, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 117:
-; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W,
+; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.Y,
; EG-NEXT: MOV T0.Y, T4.X,
; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
@@ -10883,9 +11114,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -10982,21 +11215,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s6, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s7, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s16, s4, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s18, s5, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s20, s2, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s22, s3, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s24, s0, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00
+; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s1, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff
@@ -11015,21 +11248,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 16
; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
@@ -11078,55 +11311,55 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16
; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s24, s1, 0x80008
; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s22, s0, 0x80008
; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 16
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16
; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24
-; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s20, s3, 0x80008
; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24
; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 16
; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s18, s2, 0x80008
; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1
; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 16
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16
; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24
-; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s16, s5, 0x80008
; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 16
; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s14, s4, 0x80008
; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1
; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 16
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s12, s7, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24
; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0
; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00
+; GFX7-HSA-NEXT: s_bfe_u32 s10, s6, 0x80008
; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 16
; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24
; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0
; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 16
; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48
@@ -11164,57 +11397,56 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s1, 24
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
-; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
-; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
-; GFX8-NOHSA-NEXT: s_and_b32 s15, s1, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT: s_bfe_u32 s20, s1, 0x80008
+; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s3, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s21, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s22, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s20, s20, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_bfe_u32 s17, s3, 0x80008
+; GFX8-NOHSA-NEXT: s_or_b32 s20, s1, s20
; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
-; GFX8-NOHSA-NEXT: s_or_b32 s15, s15, s1
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s18, 16
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: s_or_b32 s18, s1, s0
; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s17, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s5, 0x80008
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s15, 16
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s2, 24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: s_or_b32 s2, s1, s0
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4
; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s14, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s7, 24
; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s7, 0x80008
; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s0, v4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s12, 16
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s7, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s21, s21, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s11, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s21, s22, s21
; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s4
; GFX8-NOHSA-NEXT: s_and_b32 s4, s6, 0xff
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
@@ -11232,7 +11464,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s4, v6
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v5, s11, v5, 16
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v5, s13, v5, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
@@ -11240,262 +11472,283 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s12, v3, 16
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s16, v3, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s19, v1, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v32i8_to_v32i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 103, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @120, KC0[], KC1[]
-; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 3, @17, KC0[], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 104, @21, KC0[], KC1[]
+; EG-NEXT: ALU 105, @126, KC0[], KC1[]
+; EG-NEXT: ALU 52, @232, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T42.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T40.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T39.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 16, #1
; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: AND_INT T0.W, T37.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T37.X, literal.x,
+; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 17:
+; EG-NEXT: MOV T18.X, T36.W,
+; EG-NEXT: MOV * T22.X, T36.Z,
+; EG-NEXT: MOV T26.X, T36.Y,
+; EG-NEXT: MOV * T30.X, T36.X,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: MOV T2.X, T35.W,
+; EG-NEXT: MOV * T6.X, T35.Z,
+; EG-NEXT: MOV T10.X, T35.Y,
+; EG-NEXT: MOV * T14.X, T35.X,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV * T0.Z, T16.X,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T0.Z, T10.X,
+; EG-NEXT: MOV T1.Y, T6.X,
+; EG-NEXT: MOV T1.Z, T2.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T30.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T26.X,
+; EG-NEXT: MOV T2.Z, T22.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T18.X, BS:VEC_201
+; EG-NEXT: MOV * T16.X, T0.W,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T3.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: BFE_UINT T4.W, T0.Y, literal.x, PS,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T4.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
+; EG-NEXT: MOV * T0.W, T17.X,
+; EG-NEXT: BFE_UINT T4.W, T0.Y, literal.x, T3.W,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: LSHR * T4.W, T0.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T4.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
; EG-NEXT: MOV T17.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, T0.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T12.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Y, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T0.Z, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T12.X, PV.W,
; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T0.Z, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T13.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T35.W, PV.W, PS,
; EG-NEXT: MOV T13.X, PV.W,
; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, T1.Y, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T8.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Z, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T1.Y, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T1.Y, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T9.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
+; EG-NEXT: LSHR * T0.W, T1.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.W, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, T1.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: ALU clause starting at 126:
+; EG-NEXT: MOV * T4.X, T0.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.W, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T1.Z, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T1.Z, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 120:
-; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
+; EG-NEXT: LSHR * T0.W, T1.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T36.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T32.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.X, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, T1.W, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T32.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.X, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T1.W, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T4.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T32.X, PV.W,
; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_UINT * T0.W, T1.W, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T4.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T33.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
+; EG-NEXT: LSHR * T0.W, T1.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
; EG-NEXT: MOV T33.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T28.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T2.Y, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T28.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Y, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T2.Y, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T28.X, PV.W,
; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T2.Y, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T29.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
+; EG-NEXT: LSHR * T0.W, T2.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T37.W, PV.W, PS,
; EG-NEXT: MOV T29.X, PV.W,
; EG-NEXT: MOV * T0.Y, T24.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T2.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T24.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Z, literal.x,
+; EG-NEXT: BFE_UINT * T0.W, T2.Z, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T24.X, PV.W,
; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T2.Z, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
; EG-NEXT: MOV * T25.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
+; EG-NEXT: LSHR * T0.W, T2.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 232:
+; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
; EG-NEXT: MOV T25.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T20.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.W, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: ALU clause starting at 225:
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: LSHL * T1.W, T35.W, literal.x,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: BFE_UINT * T0.W, T2.W, literal.x, T3.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T20.X, PV.W,
; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT * T0.W, T2.W, literal.x, T3.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
@@ -11507,27 +11760,27 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.W, T35.W, literal.x,
+; EG-NEXT: LSHR T0.W, T2.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
+; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44)
; EG-NEXT: LSHR T41.X, PS, literal.x,
; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.z,
+; EG-NEXT: LSHL T0.W, PV.W, literal.z,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44)
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT: OR_INT * T38.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T35.X, T16.X,
+; EG-NEXT: MOV * T35.Z, T12.X,
+; EG-NEXT: MOV T36.X, T8.X,
+; EG-NEXT: MOV T36.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T37.X, T32.X,
+; EG-NEXT: MOV * T37.Z, T28.X,
+; EG-NEXT: MOV T38.X, T24.X,
+; EG-NEXT: MOV * T38.Z, T20.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
@@ -11977,23 +12230,25 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v32i8_to_v32i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 104, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @121, KC0[], KC1[]
-; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 1, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @12
+; EG-NEXT: ALU 104, @18, KC0[], KC1[]
+; EG-NEXT: ALU 104, @123, KC0[], KC1[]
+; EG-NEXT: ALU 102, @228, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @331, KC0[], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
-; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: MOV * T0.Y, T16.X,
; EG-NEXT: MOV * T35.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -12004,9 +12259,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T37.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12040,9 +12297,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12076,9 +12335,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12093,14 +12354,14 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: ALU clause starting at 123:
+; EG-NEXT: MOV T0.Y, T9.X,
; EG-NEXT: ASHR * T0.W, T37.Z, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 121:
-; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W,
+; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.Y,
; EG-NEXT: MOV T0.Y, T4.X,
; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x,
@@ -12113,9 +12374,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12149,9 +12412,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T35.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12185,18 +12450,21 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
+; EG-NEXT: MOV * T0.Y, T29.X,
+; EG-NEXT: ALU clause starting at 228:
; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), -65536(nan)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -12205,9 +12473,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: ASHR * T0.W, T35.Y, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 226:
-; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T38.W, PV.W, PS,
; EG-NEXT: MOV T29.X, PV.W,
@@ -12222,9 +12489,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12258,9 +12527,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
@@ -12293,7 +12564,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHR T42.X, PS, literal.x,
; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
+; EG-NEXT: ALU clause starting at 331:
+; EG-NEXT: MOV T21.X, T35.W,
; EG-NEXT: MOV * T36.X, T16.X,
; EG-NEXT: MOV * T36.Z, T12.X,
; EG-NEXT: MOV T37.X, T8.X,
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 21e27bfa75531d..370fd5b97a2349 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1205,7 +1205,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1214,9 +1214,11 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Y, T4.X, literal.x,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR * T4.Y, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
@@ -1224,7 +1226,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -1233,9 +1235,11 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T4.Y, T4.X, literal.x,
+; CM-NEXT: MOV * T2.X, T4.X,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: LSHR * T4.Y, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T4.X, T4.X, literal.x,
+; CM-NEXT: AND_INT * T4.X, T0.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -1304,8 +1308,8 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
+; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1313,19 +1317,21 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.W, T4.X, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: BFE_INT T4.X, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v2i16_to_v2i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T4.X
+; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
@@ -1333,11 +1339,13 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; CM-NEXT: LSHR * T0.W, T4.X, literal.x,
+; CM-NEXT: MOV * T2.X, T4.X,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: BFE_INT T4.X, PV.Y, 0.0, literal.x,
+; CM-NEXT: LSHR * T0.W, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
+; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <2 x i16>, ptr addrspace(1) %in
%ext = sext <2 x i16> %load to <2 x i32>
@@ -1636,7 +1644,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1645,13 +1653,17 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T5.W, T5.Y, literal.x,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: LSHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x,
+; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR * T5.Y, T5.X, literal.x,
+; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
+; EG-NEXT: AND_INT T5.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
@@ -1659,7 +1671,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -1668,13 +1680,17 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T5.W, T5.Y, literal.x,
+; CM-NEXT: MOV * T2.X, T5.X,
+; CM-NEXT: MOV T3.X, T5.Y,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: MOV * T0.Z, PV.X,
+; CM-NEXT: LSHR * T5.W, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T5.Z, T5.Y, literal.x,
+; CM-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR * T5.Y, T5.X, literal.x,
+; CM-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T5.X, T5.X, literal.x,
+; CM-NEXT: AND_INT * T5.X, T0.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -1752,8 +1768,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1761,24 +1777,28 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.Y, literal.x,
+; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
+; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v4i16_to_v4i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T5.X
+; CM-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
@@ -1786,16 +1806,20 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
+; CM-NEXT: MOV * T2.X, T5.X,
+; CM-NEXT: MOV T3.X, T5.Y,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: MOV * T0.Z, PV.X,
+; CM-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
-; CM-NEXT: LSHR * T0.W, T5.Y, literal.x,
+; CM-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
+; CM-NEXT: LSHR * T0.W, T0.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T0.Z, T5.X, literal.x,
-; CM-NEXT: BFE_INT * T6.W, PV.W, 0.0, literal.x,
+; CM-NEXT: LSHR T0.Z, T0.Y, literal.x,
+; CM-NEXT: BFE_INT * T5.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T6.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT * T5.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <4 x i16>, ptr addrspace(1) %in
%ext = sext <4 x i16> %load to <4 x i32>
@@ -1887,29 +1911,37 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
+; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T8.W, T7.Y, literal.x,
+; EG-NEXT: MOV T2.X, T7.X,
+; EG-NEXT: MOV * T3.X, T7.Y,
+; EG-NEXT: MOV T4.X, T7.Z,
+; EG-NEXT: MOV * T5.X, T7.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: LSHR * T7.W, PS, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T8.Z, T7.Y, literal.x,
+; EG-NEXT: AND_INT * T7.Z, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T8.Y, T7.X, literal.x,
-; EG-NEXT: LSHR * T9.W, T7.W, literal.x,
+; EG-NEXT: LSHR T7.Y, T0.W, literal.x,
+; EG-NEXT: LSHR * T8.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT: AND_INT T9.Z, T7.W, literal.x,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T7.X, T0.W, literal.x,
+; EG-NEXT: AND_INT T8.Z, T0.Z, literal.x,
+; EG-NEXT: LSHR * T9.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: LSHR * T9.Y, T7.Z, literal.x,
+; EG-NEXT: LSHR * T8.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T9.X, T7.Z, literal.x,
+; EG-NEXT: AND_INT T8.X, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR * T10.X, PV.W, literal.x,
@@ -1919,30 +1951,38 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T10.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T9.X
+; CM-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T9.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T8.W, T7.W, literal.x,
+; CM-NEXT: MOV * T2.X, T7.X,
+; CM-NEXT: MOV * T3.X, T7.Y,
+; CM-NEXT: MOV * T4.X, T7.Z,
+; CM-NEXT: MOV T5.X, T7.W,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, PV.X,
+; CM-NEXT: MOV * T1.Y, PV.X,
+; CM-NEXT: LSHR * T7.W, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T8.Z, T7.W, literal.x,
+; CM-NEXT: AND_INT * T7.Z, T1.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR T8.Y, T7.Z, literal.x,
-; CM-NEXT: LSHR * T7.W, T7.Y, literal.x,
+; CM-NEXT: LSHR T7.Y, T0.W, literal.x,
+; CM-NEXT: LSHR * T8.W, T0.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T8.X, T7.Z, literal.x,
-; CM-NEXT: AND_INT T7.Z, T7.Y, literal.x,
+; CM-NEXT: AND_INT T7.X, T0.W, literal.x,
+; CM-NEXT: AND_INT T8.Z, T0.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
-; CM-NEXT: LSHR * T7.Y, T7.X, literal.y,
+; CM-NEXT: LSHR * T8.Y, T0.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT * T7.X, T7.X, literal.x,
+; CM-NEXT: AND_INT * T8.X, T0.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T10.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -2036,68 +2076,84 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
+; EG-NEXT: ALU 27, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x,
+; EG-NEXT: MOV T2.X, T7.X,
+; EG-NEXT: MOV * T3.X, T7.Y,
+; EG-NEXT: MOV T4.X, T7.Z,
+; EG-NEXT: MOV * T5.X, T7.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: BFE_INT * T7.Z, PS, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
+; EG-NEXT: BFE_INT T7.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T8.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T7.W, literal.x,
-; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: BFE_INT T8.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T7.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y,
-; EG-NEXT: LSHR T1.Z, T7.Z, literal.y,
-; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T7.Y, PS, 0.0, literal.y,
+; EG-NEXT: LSHR T1.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T8.W, PV.Z, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T10.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: BFE_INT * T8.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v8i16_to_v8i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T7.X
+; CM-NEXT: ALU 27, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T9.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: BFE_INT * T8.Z, T7.W, 0.0, literal.x,
+; CM-NEXT: MOV * T2.X, T7.X,
+; CM-NEXT: MOV * T3.X, T7.Y,
+; CM-NEXT: MOV * T4.X, T7.Z,
+; CM-NEXT: MOV T5.X, T7.W,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, PV.X,
+; CM-NEXT: MOV * T1.Y, PV.X,
+; CM-NEXT: BFE_INT * T7.Z, PV.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T8.X, T7.Z, 0.0, literal.x,
-; CM-NEXT: LSHR T0.Y, T7.Y, literal.x,
-; CM-NEXT: BFE_INT T9.Z, T7.Y, 0.0, literal.x,
-; CM-NEXT: LSHR * T0.W, T7.W, literal.x,
+; CM-NEXT: BFE_INT T7.X, T0.W, 0.0, literal.x,
+; CM-NEXT: LSHR T2.Y, T0.Z, literal.x,
+; CM-NEXT: BFE_INT T8.Z, T0.Z, 0.0, literal.x,
+; CM-NEXT: LSHR * T1.W, T1.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x,
-; CM-NEXT: LSHR T1.Y, T7.Z, literal.x,
+; CM-NEXT: BFE_INT T8.X, T0.Y, 0.0, literal.x,
+; CM-NEXT: LSHR T1.Y, T0.W, literal.x,
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T8.W, PV.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T7.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T10.X, PV.Z, literal.x,
-; CM-NEXT: BFE_INT T8.Y, PV.Y, 0.0, literal.y,
-; CM-NEXT: LSHR T0.Z, T7.X, literal.y,
-; CM-NEXT: BFE_INT * T9.W, T0.Y, 0.0, literal.y,
+; CM-NEXT: LSHR T9.X, PV.Z, literal.x,
+; CM-NEXT: BFE_INT T7.Y, PV.Y, 0.0, literal.y,
+; CM-NEXT: LSHR T0.Z, T0.Y, literal.y,
+; CM-NEXT: BFE_INT * T8.W, T2.Y, 0.0, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT: LSHR T10.X, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT * T8.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <8 x i16>, ptr addrspace(1) %in
%ext = sext <8 x i16> %load to <8 x i32>
@@ -2234,50 +2290,70 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
;
; EG-LABEL: global_zextload_v16i16_to_v16i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 35, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T12.X, 1
+; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @10
+; EG-NEXT: ALU 3, @15, KC0[], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 47, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
-; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
-; EG-NEXT: LSHR * T13.W, T12.Y, literal.x,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV T6.X, T12.X,
+; EG-NEXT: MOV * T7.X, T12.Y,
+; EG-NEXT: MOV T8.X, T12.Z,
+; EG-NEXT: MOV * T9.X, T12.W,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T2.X, T11.X,
+; EG-NEXT: MOV * T3.X, T11.Y,
+; EG-NEXT: MOV T4.X, T11.Z,
+; EG-NEXT: MOV * T5.X, T11.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T2.X,
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: MOV T1.Z, T8.X,
+; EG-NEXT: MOV * T1.W, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T6.X,
+; EG-NEXT: MOV * T2.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T11.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T13.Z, T12.Y, literal.x,
+; EG-NEXT: AND_INT * T11.Z, T2.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T13.Y, T12.X, literal.x,
-; EG-NEXT: LSHR * T14.W, T12.W, literal.x,
+; EG-NEXT: LSHR T11.Y, T2.Y, literal.x,
+; EG-NEXT: LSHR * T12.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T12.X, literal.x,
-; EG-NEXT: AND_INT T14.Z, T12.W, literal.x,
-; EG-NEXT: LSHR * T12.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T11.X, T2.Y, literal.x,
+; EG-NEXT: AND_INT T12.Z, T1.W, literal.x,
+; EG-NEXT: LSHR * T13.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: LSHR T14.Y, T12.Z, literal.x,
-; EG-NEXT: LSHR * T15.W, T11.Y, literal.x,
+; EG-NEXT: LSHR T12.Y, T1.Z, literal.x,
+; EG-NEXT: LSHR * T14.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T14.X, T12.Z, literal.x,
-; EG-NEXT: AND_INT T15.Z, T11.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T12.X, T1.Z, literal.x,
+; EG-NEXT: AND_INT T14.Z, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: LSHR T15.Y, T11.X, literal.y,
-; EG-NEXT: LSHR T17.W, T11.W, literal.y,
-; EG-NEXT: AND_INT * T15.X, T11.X, literal.z,
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
+; EG-NEXT: LSHR T14.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T16.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T14.X, T0.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T17.Z, T11.W, literal.x,
+; EG-NEXT: AND_INT T16.Z, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: LSHR T17.Y, T11.Z, literal.y,
-; EG-NEXT: AND_INT * T17.X, T11.Z, literal.z,
+; EG-NEXT: LSHR T17.X, PV.W, literal.x,
+; EG-NEXT: LSHR T16.Y, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T16.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -2287,51 +2363,71 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
;
; CM-LABEL: global_zextload_v16i16_to_v16i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 1 @8
-; CM-NEXT: ALU 33, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T18.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T17.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T16.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T14.X
+; CM-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @10
+; CM-NEXT: ALU 3, @15, KC0[], KC1[]
+; CM-NEXT: TEX 0 @12
+; CM-NEXT: ALU 45, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T18.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T17.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T15.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T13.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 8:
-; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; CM-NEXT: ALU clause starting at 12:
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; CM-NEXT: Fetch clause starting at 12:
+; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; CM-NEXT: ALU clause starting at 14:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 13:
-; CM-NEXT: LSHR * T13.W, T12.W, literal.x,
+; CM-NEXT: ALU clause starting at 15:
+; CM-NEXT: MOV * T6.X, T12.X,
+; CM-NEXT: MOV * T7.X, T12.Y,
+; CM-NEXT: MOV * T8.X, T12.Z,
+; CM-NEXT: MOV * T9.X, T12.W,
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: MOV * T2.X, T11.X,
+; CM-NEXT: MOV * T3.X, T11.Y,
+; CM-NEXT: MOV * T4.X, T11.Z,
+; CM-NEXT: MOV T5.X, T11.W,
+; CM-NEXT: MOV T0.Y, T6.X,
+; CM-NEXT: MOV T0.Z, T7.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, T8.X, BS:VEC_201
+; CM-NEXT: MOV T1.Y, T9.X,
+; CM-NEXT: MOV T1.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T3.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T4.X,
+; CM-NEXT: MOV * T2.Z, T5.X, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR * T11.W, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T13.Z, T12.W, literal.x,
+; CM-NEXT: AND_INT * T11.Z, T2.Z, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR T13.Y, T12.Z, literal.x,
-; CM-NEXT: LSHR * T12.W, T12.Y, literal.x,
+; CM-NEXT: LSHR T11.Y, T2.Y, literal.x,
+; CM-NEXT: LSHR * T12.W, T1.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T13.X, T12.Z, literal.x,
-; CM-NEXT: AND_INT T12.Z, T12.Y, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T11.X, T2.Y, literal.x,
+; CM-NEXT: AND_INT T12.Z, T1.W, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; CM-NEXT: LSHR T14.X, PV.W, literal.x,
-; CM-NEXT: LSHR T12.Y, T12.X, literal.y,
-; CM-NEXT: LSHR * T15.W, T11.W, literal.y,
+; CM-NEXT: LSHR T13.X, PV.W, literal.x,
+; CM-NEXT: LSHR T12.Y, T1.Z, literal.y,
+; CM-NEXT: LSHR * T14.W, T1.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T12.X, T12.X, literal.x,
-; CM-NEXT: AND_INT T15.Z, T11.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T12.X, T1.Z, literal.x,
+; CM-NEXT: AND_INT T14.Z, T1.Y, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; CM-NEXT: LSHR T16.X, PV.W, literal.x,
-; CM-NEXT: LSHR T15.Y, T11.Z, literal.y,
-; CM-NEXT: LSHR * T11.W, T11.Y, literal.y,
+; CM-NEXT: LSHR T15.X, PV.W, literal.x,
+; CM-NEXT: LSHR T14.Y, T0.W, literal.y,
+; CM-NEXT: LSHR * T16.W, T0.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T15.X, T11.Z, literal.x,
-; CM-NEXT: AND_INT T11.Z, T11.Y, literal.x,
+; CM-NEXT: AND_INT T14.X, T0.W, literal.x,
+; CM-NEXT: AND_INT T16.Z, T0.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
-; CM-NEXT: LSHR * T11.Y, T11.X, literal.y,
+; CM-NEXT: LSHR * T16.Y, T0.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT * T11.X, T11.X, literal.x,
+; CM-NEXT: AND_INT * T16.X, T0.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T18.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -2470,117 +2566,157 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
;
; EG-LABEL: global_sextload_v16i16_to_v16i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1
+; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @10
+; EG-NEXT: ALU 3, @15, KC0[], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 51, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV T6.X, T12.X,
+; EG-NEXT: MOV * T7.X, T12.Y,
+; EG-NEXT: MOV T8.X, T12.Z,
+; EG-NEXT: MOV * T9.X, T12.W,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T2.X, T11.X,
+; EG-NEXT: MOV * T3.X, T11.Y,
+; EG-NEXT: MOV T4.X, T11.Z,
+; EG-NEXT: MOV * T5.X, T11.W,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, T2.X,
+; EG-NEXT: MOV T0.W, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.Y, T3.X,
+; EG-NEXT: MOV T1.Z, T9.X,
+; EG-NEXT: MOV * T1.W, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T6.X,
+; EG-NEXT: MOV * T2.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T13.Z, T2.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Y, T12.W, literal.x,
-; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T0.W, T12.Y, literal.x,
-; EG-NEXT: LSHR * T1.W, T11.Y, literal.x,
+; EG-NEXT: BFE_INT T13.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T3.Y, T1.W, literal.x,
+; EG-NEXT: BFE_INT T14.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T1.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T3.W, T2.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Y, T11.W, literal.x,
-; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
+; EG-NEXT: BFE_INT T14.X, T0.W, 0.0, literal.x,
+; EG-NEXT: LSHR T4.Y, T1.Z, literal.x,
+; EG-NEXT: BFE_INT T15.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T13.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T3.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T11.Z, literal.x,
+; EG-NEXT: BFE_INT T15.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T13.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.Z, T1.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T14.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T12.X, literal.x,
-; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T14.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T15.W, T2.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T11.X, PS, literal.x,
-; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.Z, T12.Z, literal.y,
-; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T17.X, PS, literal.x,
+; EG-NEXT: BFE_INT T15.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T16.W, T3.Y, 0.0, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T18.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T16.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v16i16_to_v16i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 1 @8
-; CM-NEXT: ALU 40, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T11.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T18.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T14.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T13.X
+; CM-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @10
+; CM-NEXT: ALU 3, @15, KC0[], KC1[]
+; CM-NEXT: TEX 0 @12
+; CM-NEXT: ALU 52, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T18.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T17.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T12.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T11.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 8:
-; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; CM-NEXT: ALU clause starting at 12:
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; CM-NEXT: Fetch clause starting at 12:
+; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; CM-NEXT: ALU clause starting at 14:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 13:
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU clause starting at 15:
+; CM-NEXT: MOV * T6.X, T12.X,
+; CM-NEXT: MOV * T7.X, T12.Y,
+; CM-NEXT: MOV * T8.X, T12.Z,
+; CM-NEXT: MOV * T9.X, T12.W,
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: MOV * T2.X, T11.X,
+; CM-NEXT: MOV * T3.X, T11.Y,
+; CM-NEXT: MOV * T4.X, T11.Z,
+; CM-NEXT: MOV T5.X, T11.W,
+; CM-NEXT: MOV T0.Y, T6.X,
+; CM-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, PV.X,
+; CM-NEXT: MOV T1.Y, T2.X,
+; CM-NEXT: MOV T1.Z, PV.X,
+; CM-NEXT: MOV * T1.W, T9.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV T2.Y, T8.X,
+; CM-NEXT: MOV T2.Z, T7.X, BS:VEC_120/SCL_212
+; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T13.X, PV.W, literal.x,
-; CM-NEXT: LSHR T0.Y, T11.Y, literal.y,
-; CM-NEXT: LSHR T0.Z, T11.Z, literal.y,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T11.X, PV.W, literal.x,
+; CM-NEXT: LSHR T3.Y, PV.Z, literal.y,
+; CM-NEXT: LSHR T3.Z, PV.Y, literal.y,
+; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T14.X, PV.W, literal.x,
-; CM-NEXT: LSHR T1.Y, T11.W, literal.y,
-; CM-NEXT: BFE_INT T15.Z, T12.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: LSHR * T0.W, T12.X, literal.y,
+; CM-NEXT: LSHR T12.X, PV.W, literal.x,
+; CM-NEXT: LSHR T4.Y, T1.W, literal.y,
+; CM-NEXT: BFE_INT T13.Z, T1.Z, 0.0, literal.y,
+; CM-NEXT: LSHR * T2.W, T1.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: BFE_INT T15.X, T12.Z, 0.0, literal.x,
-; CM-NEXT: LSHR T2.Y, T12.Y, literal.x,
-; CM-NEXT: BFE_INT T16.Z, T12.Y, 0.0, literal.x,
-; CM-NEXT: LSHR * T1.W, T12.W, literal.x,
+; CM-NEXT: BFE_INT T13.X, T0.W, 0.0, literal.x,
+; CM-NEXT: LSHR T5.Y, T0.Z, literal.x,
+; CM-NEXT: BFE_INT T14.Z, T0.Z, 0.0, literal.x,
+; CM-NEXT: LSHR * T3.W, T1.Z, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T16.X, T12.X, 0.0, literal.x,
-; CM-NEXT: LSHR T3.Y, T12.Z, literal.x,
-; CM-NEXT: BFE_INT T12.Z, T11.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T15.W, PV.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T14.X, T1.Y, 0.0, literal.x,
+; CM-NEXT: LSHR T1.Y, T0.W, literal.x,
+; CM-NEXT: BFE_INT T15.Z, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T13.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T12.X, T11.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T15.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT T17.Z, T11.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T16.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T15.X, T2.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T13.Y, PV.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T16.Z, T2.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T14.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T17.X, T11.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T16.Y, T0.W, 0.0, literal.x,
-; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T12.W, T1.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T16.X, T0.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T14.Y, T2.W, 0.0, literal.x,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT * T15.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T18.X, PV.Z, literal.x,
-; CM-NEXT: BFE_INT T12.Y, T0.Z, 0.0, literal.y,
-; CM-NEXT: LSHR T0.Z, T11.X, literal.y,
-; CM-NEXT: BFE_INT * T17.W, T0.Y, 0.0, literal.y,
+; CM-NEXT: LSHR T17.X, PV.Z, literal.x,
+; CM-NEXT: BFE_INT T15.Y, T3.Z, 0.0, literal.y,
+; CM-NEXT: LSHR T0.Z, T0.Y, literal.y,
+; CM-NEXT: BFE_INT * T16.W, T3.Y, 0.0, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T17.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT: LSHR T18.X, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT * T16.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <16 x i16>, ptr addrspace(1) %in
%ext = sext <16 x i16> %load to <16 x i32>
@@ -2811,93 +2947,137 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
;
; EG-LABEL: global_zextload_v32i16_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @12
-; EG-NEXT: ALU 72, @21, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T24.X, 1
+; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @18
+; EG-NEXT: ALU 3, @27, KC0[], KC1[]
+; EG-NEXT: TEX 0 @20
+; EG-NEXT: ALU 3, @31, KC0[], KC1[]
+; EG-NEXT: TEX 0 @22
+; EG-NEXT: ALU 3, @35, KC0[], KC1[]
+; EG-NEXT: TEX 0 @24
+; EG-NEXT: ALU 92, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 48, #1
-; EG-NEXT: VTX_READ_128 T22.XYZW, T19.X, 32, #1
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 16, #1
-; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: Fetch clause starting at 20:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 22:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 24:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 26:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 21:
-; EG-NEXT: LSHR * T23.W, T20.W, literal.x,
+; EG-NEXT: ALU clause starting at 27:
+; EG-NEXT: MOV T16.X, T20.Z,
+; EG-NEXT: MOV * T17.X, T20.W,
+; EG-NEXT: MOV T14.X, T20.X,
+; EG-NEXT: MOV * T15.X, T20.Y,
+; EG-NEXT: ALU clause starting at 31:
+; EG-NEXT: MOV T12.X, T20.Z,
+; EG-NEXT: MOV * T13.X, T20.W,
+; EG-NEXT: MOV T10.X, T20.X,
+; EG-NEXT: MOV * T11.X, T20.Y,
+; EG-NEXT: ALU clause starting at 35:
+; EG-NEXT: MOV T8.X, T20.Z,
+; EG-NEXT: MOV * T9.X, T20.W,
+; EG-NEXT: MOV T6.X, T20.X,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: ALU clause starting at 39:
+; EG-NEXT: MOV T4.X, T19.Z,
+; EG-NEXT: MOV * T5.X, T19.W,
+; EG-NEXT: MOV T2.X, T19.X,
+; EG-NEXT: MOV * T3.X, T19.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: MOV T0.W, T4.X,
+; EG-NEXT: MOV * T1.Y, T5.X,
+; EG-NEXT: MOV T1.Z, T6.X,
+; EG-NEXT: MOV * T1.W, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T8.X,
+; EG-NEXT: MOV T2.Z, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T10.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T11.X,
+; EG-NEXT: MOV T3.Z, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T13.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T14.X,
+; EG-NEXT: MOV T4.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T16.X, BS:VEC_201
+; EG-NEXT: MOV * T5.Y, T17.X,
+; EG-NEXT: LSHR * T19.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T23.Z, T20.W, literal.x,
+; EG-NEXT: AND_INT * T19.Z, T5.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T23.Y, T20.Z, literal.x,
-; EG-NEXT: LSHR * T20.W, T20.Y, literal.x,
+; EG-NEXT: LSHR T19.Y, T4.W, literal.x,
+; EG-NEXT: LSHR * T20.W, T4.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T23.X, T20.Z, literal.x,
-; EG-NEXT: AND_INT T20.Z, T20.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T19.X, T4.W, literal.x,
+; EG-NEXT: AND_INT T20.Z, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T24.X, PV.W, literal.x,
-; EG-NEXT: LSHR T20.Y, T20.X, literal.y,
-; EG-NEXT: LSHR T25.W, T19.W, literal.y,
-; EG-NEXT: AND_INT * T20.X, T20.X, literal.z,
+; EG-NEXT: LSHR T21.X, PV.W, literal.x,
+; EG-NEXT: LSHR T20.Y, T4.Y, literal.y,
+; EG-NEXT: LSHR T22.W, T3.W, literal.y,
+; EG-NEXT: AND_INT * T20.X, T4.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T25.Z, T19.W, literal.x,
+; EG-NEXT: AND_INT * T22.Z, T3.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T26.X, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR T25.Y, T19.Z, literal.y,
-; EG-NEXT: LSHR T19.W, T19.Y, literal.y,
-; EG-NEXT: AND_INT * T25.X, T19.Z, literal.z,
+; EG-NEXT: LSHR T23.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T22.Y, T3.Z, literal.y,
+; EG-NEXT: LSHR T24.W, T3.Y, literal.y,
+; EG-NEXT: AND_INT * T22.X, T3.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T19.Z, T19.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T24.Z, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
-; EG-NEXT: LSHR T19.Y, T19.X, literal.y,
-; EG-NEXT: LSHR T28.W, T22.W, literal.y,
-; EG-NEXT: AND_INT * T19.X, T19.X, literal.z,
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: LSHR T24.Y, T2.W, literal.y,
+; EG-NEXT: LSHR T26.W, T2.Z, literal.y,
+; EG-NEXT: AND_INT * T24.X, T2.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T28.Z, T22.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T26.Z, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: LSHR T28.Y, T22.Z, literal.y,
-; EG-NEXT: LSHR T22.W, T22.Y, literal.y,
-; EG-NEXT: AND_INT * T28.X, T22.Z, literal.z,
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: LSHR T26.Y, T2.Y, literal.y,
+; EG-NEXT: LSHR T28.W, T1.W, literal.y,
+; EG-NEXT: AND_INT * T26.X, T2.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T22.Z, T22.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T28.Z, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
-; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: LSHR T22.Y, T22.X, literal.y,
-; EG-NEXT: LSHR T31.W, T21.W, literal.y,
-; EG-NEXT: AND_INT * T22.X, T22.X, literal.z,
+; EG-NEXT: LSHR T29.X, PV.W, literal.x,
+; EG-NEXT: LSHR T28.Y, T1.Z, literal.y,
+; EG-NEXT: LSHR T30.W, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T28.X, T1.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T31.Z, T21.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T30.Z, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT: LSHR T32.X, PV.W, literal.x,
-; EG-NEXT: LSHR T31.Y, T21.Z, literal.y,
-; EG-NEXT: LSHR T21.W, T21.Y, literal.y,
-; EG-NEXT: AND_INT * T31.X, T21.Z, literal.z,
+; EG-NEXT: LSHR T31.X, PV.W, literal.x,
+; EG-NEXT: LSHR T30.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T32.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T30.X, T0.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T21.Z, T21.Y, literal.x,
+; EG-NEXT: AND_INT T32.Z, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
-; EG-NEXT: LSHR T21.Y, T21.X, literal.y,
-; EG-NEXT: AND_INT * T21.X, T21.X, literal.z,
+; EG-NEXT: LSHR T32.Y, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T32.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -2907,88 +3087,132 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
;
; CM-LABEL: global_zextload_v32i16_to_v32i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 3 @12
-; CM-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T21.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T32.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T22.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T29.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T19.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T26.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T20.X
+; CM-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @18
+; CM-NEXT: ALU 3, @27, KC0[], KC1[]
+; CM-NEXT: TEX 0 @20
+; CM-NEXT: ALU 3, @31, KC0[], KC1[]
+; CM-NEXT: TEX 0 @22
+; CM-NEXT: ALU 3, @35, KC0[], KC1[]
+; CM-NEXT: TEX 0 @24
+; CM-NEXT: ALU 85, @39, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T34.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T33.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T31.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T29.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T27.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T23.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T21.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 12:
-; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
-; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 0, #1
-; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 16, #1
-; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 32, #1
-; CM-NEXT: ALU clause starting at 20:
+; CM-NEXT: Fetch clause starting at 18:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; CM-NEXT: Fetch clause starting at 20:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; CM-NEXT: Fetch clause starting at 22:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; CM-NEXT: Fetch clause starting at 24:
+; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; CM-NEXT: ALU clause starting at 26:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 21:
-; CM-NEXT: LSHR * T23.W, T20.Y, literal.x,
+; CM-NEXT: ALU clause starting at 27:
+; CM-NEXT: MOV * T16.X, T20.Z,
+; CM-NEXT: MOV * T17.X, T20.W,
+; CM-NEXT: MOV * T14.X, T20.X,
+; CM-NEXT: MOV * T15.X, T20.Y,
+; CM-NEXT: ALU clause starting at 31:
+; CM-NEXT: MOV * T12.X, T20.Z,
+; CM-NEXT: MOV * T13.X, T20.W,
+; CM-NEXT: MOV * T10.X, T20.X,
+; CM-NEXT: MOV * T11.X, T20.Y,
+; CM-NEXT: ALU clause starting at 35:
+; CM-NEXT: MOV * T8.X, T20.Z,
+; CM-NEXT: MOV * T9.X, T20.W,
+; CM-NEXT: MOV * T6.X, T20.X,
+; CM-NEXT: MOV * T7.X, T20.Y,
+; CM-NEXT: ALU clause starting at 39:
+; CM-NEXT: MOV * T4.X, T19.Z,
+; CM-NEXT: MOV * T5.X, T19.W,
+; CM-NEXT: MOV * T2.X, T19.X,
+; CM-NEXT: MOV T3.X, T19.Y,
+; CM-NEXT: MOV T0.Y, T16.X,
+; CM-NEXT: MOV T0.Z, T17.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, T14.X, BS:VEC_201
+; CM-NEXT: MOV T1.Y, T15.X,
+; CM-NEXT: MOV T1.Z, T12.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T13.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T10.X,
+; CM-NEXT: MOV T2.Z, T11.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T8.X, BS:VEC_201
+; CM-NEXT: MOV T3.Y, T9.X,
+; CM-NEXT: MOV T3.Z, T6.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T3.W, T7.X, BS:VEC_201
+; CM-NEXT: MOV T4.Y, T4.X,
+; CM-NEXT: MOV T4.Z, T5.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T4.W, T2.X, BS:VEC_201
+; CM-NEXT: MOV * T5.Y, T3.X,
+; CM-NEXT: LSHR * T19.W, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T23.Z, T20.Y, literal.x,
+; CM-NEXT: AND_INT * T19.Z, T5.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR T23.Y, T20.X, literal.x,
-; CM-NEXT: LSHR * T24.W, T20.W, literal.x,
+; CM-NEXT: LSHR T19.Y, T4.W, literal.x,
+; CM-NEXT: LSHR * T20.W, T4.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T23.X, T20.X, literal.x,
-; CM-NEXT: AND_INT T24.Z, T20.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T19.X, T4.W, literal.x,
+; CM-NEXT: AND_INT T20.Z, T4.Z, literal.x,
+; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
-; CM-NEXT: LSHR T20.X, PV.W, literal.x,
-; CM-NEXT: LSHR T24.Y, T20.Z, literal.y,
-; CM-NEXT: LSHR * T25.W, T19.Y, literal.y,
+; CM-NEXT: LSHR T21.X, PV.W, literal.x,
+; CM-NEXT: LSHR T20.Y, T4.Y, literal.y,
+; CM-NEXT: LSHR * T22.W, T3.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T24.X, T20.Z, literal.x,
-; CM-NEXT: AND_INT T25.Z, T19.Y, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T20.X, T4.Y, literal.x,
+; CM-NEXT: AND_INT T22.Z, T3.W, literal.x,
+; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
-; CM-NEXT: LSHR T26.X, PV.W, literal.x,
-; CM-NEXT: LSHR T25.Y, T19.X, literal.y,
-; CM-NEXT: LSHR * T27.W, T19.W, literal.y,
+; CM-NEXT: LSHR T23.X, PV.W, literal.x,
+; CM-NEXT: LSHR T22.Y, T3.Z, literal.y,
+; CM-NEXT: LSHR * T24.W, T3.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T25.X, T19.X, literal.x,
-; CM-NEXT: AND_INT T27.Z, T19.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T22.X, T3.Z, literal.x,
+; CM-NEXT: AND_INT T24.Z, T3.Y, literal.x,
+; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
-; CM-NEXT: LSHR T19.X, PV.W, literal.x,
-; CM-NEXT: LSHR T27.Y, T19.Z, literal.y,
-; CM-NEXT: LSHR * T28.W, T22.Y, literal.y,
+; CM-NEXT: LSHR T25.X, PV.W, literal.x,
+; CM-NEXT: LSHR T24.Y, T2.W, literal.y,
+; CM-NEXT: LSHR * T26.W, T2.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T27.X, T19.Z, literal.x,
-; CM-NEXT: AND_INT T28.Z, T22.Y, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T24.X, T2.W, literal.x,
+; CM-NEXT: AND_INT T26.Z, T2.Z, literal.x,
+; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
-; CM-NEXT: LSHR T29.X, PV.W, literal.x,
-; CM-NEXT: LSHR T28.Y, T22.X, literal.y,
-; CM-NEXT: LSHR * T30.W, T22.W, literal.y,
+; CM-NEXT: LSHR T27.X, PV.W, literal.x,
+; CM-NEXT: LSHR T26.Y, T2.Y, literal.y,
+; CM-NEXT: LSHR * T28.W, T1.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T28.X, T22.X, literal.x,
-; CM-NEXT: AND_INT T30.Z, T22.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T26.X, T2.Y, literal.x,
+; CM-NEXT: AND_INT T28.Z, T1.W, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; CM-NEXT: LSHR T22.X, PV.W, literal.x,
-; CM-NEXT: LSHR T30.Y, T22.Z, literal.y,
-; CM-NEXT: LSHR * T31.W, T21.Y, literal.y,
+; CM-NEXT: LSHR T29.X, PV.W, literal.x,
+; CM-NEXT: LSHR T28.Y, T1.Z, literal.y,
+; CM-NEXT: LSHR * T30.W, T1.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T30.X, T22.Z, literal.x,
-; CM-NEXT: AND_INT T31.Z, T21.Y, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T28.X, T1.Z, literal.x,
+; CM-NEXT: AND_INT T30.Z, T1.Y, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; CM-NEXT: LSHR T32.X, PV.W, literal.x,
-; CM-NEXT: LSHR T31.Y, T21.X, literal.y,
-; CM-NEXT: LSHR * T33.W, T21.W, literal.y,
+; CM-NEXT: LSHR T31.X, PV.W, literal.x,
+; CM-NEXT: LSHR T30.Y, T0.W, literal.y,
+; CM-NEXT: LSHR * T32.W, T0.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T31.X, T21.X, literal.x,
-; CM-NEXT: AND_INT * T33.Z, T21.W, literal.x,
+; CM-NEXT: AND_INT T30.X, T0.W, literal.x,
+; CM-NEXT: AND_INT * T32.Z, T0.Z, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x,
-; CM-NEXT: LSHR * T33.Y, T21.Z, literal.y,
+; CM-NEXT: LSHR T33.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T32.Y, T0.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T33.X, T21.Z, literal.x,
+; CM-NEXT: AND_INT T32.X, T0.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR * T34.X, PV.W, literal.x,
@@ -3222,220 +3446,312 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
;
; EG-LABEL: global_sextload_v32i16_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 9, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @12
-; EG-NEXT: ALU 73, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 1
+; EG-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @20
+; EG-NEXT: ALU 3, @29, KC0[], KC1[]
+; EG-NEXT: TEX 0 @22
+; EG-NEXT: ALU 3, @33, KC0[], KC1[]
+; EG-NEXT: TEX 0 @24
+; EG-NEXT: ALU 3, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @26
+; EG-NEXT: ALU 83, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 19, @125, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T23.XYZW, T22.X, 16, #1
-; EG-NEXT: VTX_READ_128 T24.XYZW, T22.X, 32, #1
-; EG-NEXT: VTX_READ_128 T25.XYZW, T22.X, 0, #1
-; EG-NEXT: VTX_READ_128 T22.XYZW, T22.X, 48, #1
-; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 20:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; EG-NEXT: Fetch clause starting at 22:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 24:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 26:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 28:
+; EG-NEXT: MOV * T19.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 29:
+; EG-NEXT: MOV T16.X, T20.Z,
+; EG-NEXT: MOV * T17.X, T20.W,
+; EG-NEXT: MOV T14.X, T20.X,
+; EG-NEXT: MOV * T15.X, T20.Y,
+; EG-NEXT: ALU clause starting at 33:
+; EG-NEXT: MOV T12.X, T20.Z,
+; EG-NEXT: MOV * T13.X, T20.W,
+; EG-NEXT: MOV T10.X, T20.X,
+; EG-NEXT: MOV * T11.X, T20.Y,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T8.X, T20.Z,
+; EG-NEXT: MOV * T9.X, T20.W,
+; EG-NEXT: MOV T6.X, T20.X,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: ALU clause starting at 41:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T19.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T20.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T20.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: MOV * T22.X, KC0[2].Z,
+; EG-NEXT: LSHR T22.X, PV.W, literal.x,
+; EG-NEXT: MOV * T4.X, T19.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T5.X, T19.W,
+; EG-NEXT: MOV * T2.X, T19.X,
+; EG-NEXT: MOV T3.X, T19.Y,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.Z, T4.X,
+; EG-NEXT: MOV T0.W, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.Y, T8.X,
+; EG-NEXT: MOV T1.Z, T10.X,
+; EG-NEXT: MOV * T1.W, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.Y, T14.X,
+; EG-NEXT: MOV T2.Z, T13.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T15.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T11.X,
+; EG-NEXT: MOV T3.Z, T16.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T9.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T7.X,
+; EG-NEXT: MOV T4.Z, T17.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T5.X, BS:VEC_201
+; EG-NEXT: MOV T5.Y, T3.X,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T26.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T19.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.W, T22.Y, literal.y,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: LSHR T5.W, T5.Y, literal.y,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T28.X, PS, literal.x,
-; EG-NEXT: LSHR T0.Y, T22.W, literal.y,
-; EG-NEXT: BFE_INT T29.Z, T25.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T1.W, T24.Y, literal.y,
-; EG-NEXT: LSHR * T2.W, T24.W, literal.y,
+; EG-NEXT: LSHR T24.X, PS, literal.x,
+; EG-NEXT: LSHR T6.Y, T4.W, literal.y,
+; EG-NEXT: BFE_INT T25.Z, T4.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T6.W, T4.Y, literal.y,
+; EG-NEXT: LSHR * T7.W, T3.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T29.X, T25.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Y, T23.Y, literal.x,
-; EG-NEXT: BFE_INT T30.Z, T25.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T3.W, T23.W, literal.x,
-; EG-NEXT: LSHR * T4.W, T25.W, literal.x,
+; EG-NEXT: BFE_INT T25.X, T3.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T7.Y, T3.Y, literal.x,
+; EG-NEXT: BFE_INT T26.Z, T2.W, 0.0, literal.x,
+; EG-NEXT: LSHR T8.W, T2.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T9.W, T4.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T30.X, T25.X, 0.0, literal.x,
-; EG-NEXT: LSHR T2.Y, T25.Y, literal.x,
-; EG-NEXT: BFE_INT T31.Z, T23.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T29.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T25.Z, literal.x,
+; EG-NEXT: BFE_INT T26.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T8.Y, T2.W, literal.x,
+; EG-NEXT: BFE_INT T27.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T2.W, T3.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T31.X, T23.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T29.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T25.Z, T23.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T30.W, PV.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T25.X, literal.x,
+; EG-NEXT: BFE_INT T27.X, T1.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T28.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T26.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T2.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T25.X, T23.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T30.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T32.Z, T24.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T3.W, T23.Z, literal.x,
+; EG-NEXT: BFE_INT T28.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T26.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T29.Z, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T27.W, T8.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T1.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T32.X, T24.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T23.Z, T24.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T25.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T3.W, T23.X, literal.x,
+; EG-NEXT: BFE_INT T29.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T27.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T30.Z, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T28.W, T7.Y, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T23.X, T24.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T25.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T33.Z, T22.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T32.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T2.W, T24.Z, literal.x,
+; EG-NEXT: BFE_INT T30.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T28.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T31.Z, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T29.W, T7.W, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T33.X, T22.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T32.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T24.Z, T22.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T23.W, T1.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T24.X, literal.x,
+; EG-NEXT: BFE_INT T31.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T29.Y, PS, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T24.X, T22.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T23.Y, PS, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T22.Z, literal.x,
-; EG-NEXT: BFE_INT T33.W, T0.Y, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: ALU clause starting at 125:
+; EG-NEXT: BFE_INT T32.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T30.W, T6.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T32.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T30.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T31.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
-; EG-NEXT: LSHR T34.X, PS, literal.x,
-; EG-NEXT: BFE_INT T33.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.Z, T22.X, literal.y,
-; EG-NEXT: BFE_INT T24.W, T0.W, 0.0, literal.y,
+; EG-NEXT: LSHR T33.X, PS, literal.x,
+; EG-NEXT: BFE_INT T31.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T32.W, T5.W, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T24.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T34.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T32.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v32i16_to_v32i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @22, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 0 @14
-; CM-NEXT: ALU 7, @23, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 2 @16
-; CM-NEXT: ALU 76, @31, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T20.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T28.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T27.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T26.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
+; CM-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @20
+; CM-NEXT: ALU 3, @29, KC0[], KC1[]
+; CM-NEXT: TEX 0 @22
+; CM-NEXT: ALU 3, @33, KC0[], KC1[]
+; CM-NEXT: TEX 0 @24
+; CM-NEXT: ALU 3, @37, KC0[], KC1[]
+; CM-NEXT: TEX 0 @26
+; CM-NEXT: ALU 85, @41, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 19, @127, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T34.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T33.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T24.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T21.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T23.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T22.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T21.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T20.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T19.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 14:
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; CM-NEXT: Fetch clause starting at 16:
-; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1
-; CM-NEXT: VTX_READ_128 T23.XYZW, T19.X, 32, #1
-; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 16, #1
-; CM-NEXT: ALU clause starting at 22:
+; CM-NEXT: Fetch clause starting at 22:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; CM-NEXT: Fetch clause starting at 24:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; CM-NEXT: Fetch clause starting at 26:
+; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; CM-NEXT: ALU clause starting at 28:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 23:
+; CM-NEXT: ALU clause starting at 29:
+; CM-NEXT: MOV * T16.X, T20.Z,
+; CM-NEXT: MOV * T17.X, T20.W,
+; CM-NEXT: MOV * T14.X, T20.X,
+; CM-NEXT: MOV * T15.X, T20.Y,
+; CM-NEXT: ALU clause starting at 33:
+; CM-NEXT: MOV * T12.X, T20.Z,
+; CM-NEXT: MOV * T13.X, T20.W,
+; CM-NEXT: MOV * T10.X, T20.X,
+; CM-NEXT: MOV * T11.X, T20.Y,
+; CM-NEXT: ALU clause starting at 37:
+; CM-NEXT: MOV * T8.X, T20.Z,
+; CM-NEXT: MOV * T9.X, T20.W,
+; CM-NEXT: MOV * T6.X, T20.X,
+; CM-NEXT: MOV * T7.X, T20.Y,
+; CM-NEXT: ALU clause starting at 41:
+; CM-NEXT: MOV * T4.X, T19.Z,
+; CM-NEXT: MOV * T5.X, T19.W,
+; CM-NEXT: MOV * T2.X, T19.X,
+; CM-NEXT: MOV T3.X, T19.Y,
+; CM-NEXT: MOV T0.Y, T17.X,
+; CM-NEXT: MOV T0.Z, T16.X, BS:VEC_120/SCL_212
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T21.X, PV.W, literal.x,
-; CM-NEXT: LSHR T0.Y, T20.Z, literal.y,
-; CM-NEXT: LSHR T0.Z, T20.W, literal.y,
+; CM-NEXT: LSHR T19.X, PV.W, literal.x,
+; CM-NEXT: LSHR T1.Y, PV.Z, literal.y,
+; CM-NEXT: LSHR T1.Z, PV.Y, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 31:
-; CM-NEXT: LSHR T24.X, T0.W, literal.x,
-; CM-NEXT: LSHR T1.Y, T20.Y, literal.y,
-; CM-NEXT: LSHR T1.Z, T19.Z, literal.y,
+; CM-NEXT: MOV * T1.W, T14.X,
+; CM-NEXT: MOV T2.Y, T5.X,
+; CM-NEXT: MOV T2.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T4.X, BS:VEC_201
+; CM-NEXT: MOV T3.Y, T3.X,
+; CM-NEXT: MOV T3.Z, T7.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T3.W, T6.X, BS:VEC_201
+; CM-NEXT: MOV T4.Y, T9.X,
+; CM-NEXT: MOV T4.Z, T8.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T4.W, T11.X, BS:VEC_201
+; CM-NEXT: MOV T5.Y, T10.X,
+; CM-NEXT: MOV T5.Z, T13.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T5.W, T12.X, BS:VEC_201
+; CM-NEXT: MOV * T6.Y, T15.X,
+; CM-NEXT: LSHR T20.X, T0.W, literal.x,
+; CM-NEXT: LSHR T7.Y, PV.Y, literal.y,
+; CM-NEXT: LSHR T6.Z, T5.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T25.X, PV.W, literal.x,
-; CM-NEXT: LSHR T2.Y, T19.W, literal.y,
-; CM-NEXT: LSHR T2.Z, T19.X, literal.y,
+; CM-NEXT: LSHR T21.X, PV.W, literal.x,
+; CM-NEXT: LSHR T8.Y, T5.Z, literal.y,
+; CM-NEXT: LSHR T7.Z, T5.Y, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T26.X, PV.W, literal.x,
-; CM-NEXT: LSHR T3.Y, T19.Y, literal.y,
-; CM-NEXT: LSHR T3.Z, T23.Z, literal.y,
+; CM-NEXT: LSHR T22.X, PV.W, literal.x,
+; CM-NEXT: LSHR T9.Y, T4.W, literal.y,
+; CM-NEXT: LSHR T8.Z, T4.Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T27.X, PV.W, literal.x,
-; CM-NEXT: LSHR T4.Y, T23.W, literal.y,
-; CM-NEXT: LSHR T4.Z, T23.X, literal.y,
+; CM-NEXT: LSHR T23.X, PV.W, literal.x,
+; CM-NEXT: LSHR T10.Y, T4.Y, literal.y,
+; CM-NEXT: LSHR T9.Z, T3.W, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T28.X, PV.W, literal.x,
-; CM-NEXT: LSHR T5.Y, T23.Y, literal.y,
-; CM-NEXT: BFE_INT T29.Z, T22.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: LSHR * T0.W, T22.Z, literal.y,
+; CM-NEXT: LSHR T24.X, PV.W, literal.x,
+; CM-NEXT: LSHR T11.Y, T3.Z, literal.y,
+; CM-NEXT: BFE_INT T25.Z, T3.Y, 0.0, literal.y,
+; CM-NEXT: LSHR * T0.W, T2.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: BFE_INT T29.X, T22.X, 0.0, literal.x,
-; CM-NEXT: LSHR T6.Y, T22.W, literal.x,
-; CM-NEXT: BFE_INT T30.Z, T22.W, 0.0, literal.x,
-; CM-NEXT: LSHR * T1.W, T22.Y, literal.x,
+; CM-NEXT: BFE_INT T25.X, T2.Z, 0.0, literal.x,
+; CM-NEXT: LSHR T12.Y, T2.Y, literal.x,
+; CM-NEXT: BFE_INT T26.Z, T2.Y, 0.0, literal.x,
+; CM-NEXT: LSHR * T6.W, T3.Y, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: BFE_INT T26.X, T2.W, 0.0, literal.x,
+; CM-NEXT: LSHR T2.Y, T2.Z, literal.x,
+; CM-NEXT: BFE_INT T27.Z, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T25.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T30.X, T22.Z, 0.0, literal.x,
-; CM-NEXT: LSHR T7.Y, T22.X, literal.x,
-; CM-NEXT: BFE_INT T22.Z, T23.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T29.W, PV.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T27.X, T3.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T25.Y, PV.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T28.Z, T4.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T26.W, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T22.X, T23.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T29.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT T31.Z, T23.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T30.W, T6.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T28.X, T4.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T26.Y, T0.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T29.Z, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T27.W, T11.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T31.X, T23.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T30.Y, T0.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT T23.Z, T19.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T22.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T29.X, T5.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T27.Y, T9.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T30.Z, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T28.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T23.X, T19.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T22.Y, T4.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T32.Z, T19.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T31.W, T4.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T30.X, T5.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T28.Y, T8.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T31.Z, T6.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T32.X, T19.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T31.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: BFE_INT T19.Z, T20.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T23.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: ALU clause starting at 127:
+; CM-NEXT: BFE_INT * T29.W, T9.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T19.X, T20.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T23.Y, T2.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T33.Z, T20.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T32.W, T2.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T31.X, T1.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T29.Y, T7.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T32.Z, T0.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T30.W, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T33.X, T20.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T32.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: LSHR T1.Z, T20.X, literal.x,
-; CM-NEXT: BFE_INT * T19.W, T1.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T32.X, T0.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T30.Y, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR T0.Z, T1.W, literal.x,
+; CM-NEXT: BFE_INT * T31.W, T7.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T20.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT T19.Y, PV.Z, 0.0, literal.y,
-; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
-; CM-NEXT: BFE_INT * T33.W, T0.Z, 0.0, literal.y,
+; CM-NEXT: LSHR T33.X, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT T31.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
+; CM-NEXT: BFE_INT * T32.W, T1.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T34.X, PV.Z, literal.x,
-; CM-NEXT: BFE_INT * T33.Y, T0.Y, 0.0, literal.y,
+; CM-NEXT: BFE_INT * T32.Y, T1.Y, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <32 x i16>, ptr addrspace(1) %in
%ext = sext <32 x i16> %load to <32 x i32>
@@ -3901,181 +4217,273 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
;
; EG-LABEL: global_zextload_v64i16_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @22
-; EG-NEXT: ALU 56, @39, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @30
-; EG-NEXT: ALU 87, @96, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T66.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T65.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T64.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T62.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T61.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T53.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1
+; EG-NEXT: ALU 0, @52, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @36
+; EG-NEXT: ALU 3, @53, KC0[], KC1[]
+; EG-NEXT: TEX 0 @38
+; EG-NEXT: ALU 3, @57, KC0[], KC1[]
+; EG-NEXT: TEX 0 @40
+; EG-NEXT: ALU 3, @61, KC0[], KC1[]
+; EG-NEXT: TEX 0 @42
+; EG-NEXT: ALU 3, @65, KC0[], KC1[]
+; EG-NEXT: TEX 0 @44
+; EG-NEXT: ALU 3, @69, KC0[], KC1[]
+; EG-NEXT: TEX 0 @46
+; EG-NEXT: ALU 3, @73, KC0[], KC1[]
+; EG-NEXT: TEX 0 @48
+; EG-NEXT: ALU 3, @77, KC0[], KC1[]
+; EG-NEXT: TEX 0 @50
+; EG-NEXT: ALU 97, @81, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 83, @179, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T66.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T65.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T63.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T61.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T59.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T57.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T55.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T53.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T51.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1
-; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 48, #1
-; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 32, #1
-; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 16, #1
-; EG-NEXT: Fetch clause starting at 30:
-; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1
-; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1
-; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1
-; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: MOV * T37.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 39:
-; EG-NEXT: LSHR * T35.W, T36.W, literal.x,
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 36:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; EG-NEXT: Fetch clause starting at 38:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; EG-NEXT: Fetch clause starting at 40:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 32, #1
+; EG-NEXT: Fetch clause starting at 42:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 48, #1
+; EG-NEXT: Fetch clause starting at 44:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 64, #1
+; EG-NEXT: Fetch clause starting at 46:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 80, #1
+; EG-NEXT: Fetch clause starting at 48:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; EG-NEXT: Fetch clause starting at 50:
+; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 112, #1
+; EG-NEXT: ALU clause starting at 52:
+; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 53:
+; EG-NEXT: MOV T32.X, T36.Z,
+; EG-NEXT: MOV * T33.X, T36.W,
+; EG-NEXT: MOV T30.X, T36.X,
+; EG-NEXT: MOV * T31.X, T36.Y,
+; EG-NEXT: ALU clause starting at 57:
+; EG-NEXT: MOV T28.X, T36.Z,
+; EG-NEXT: MOV * T29.X, T36.W,
+; EG-NEXT: MOV T26.X, T36.X,
+; EG-NEXT: MOV * T27.X, T36.Y,
+; EG-NEXT: ALU clause starting at 61:
+; EG-NEXT: MOV T24.X, T36.Z,
+; EG-NEXT: MOV * T25.X, T36.W,
+; EG-NEXT: MOV T22.X, T36.X,
+; EG-NEXT: MOV * T23.X, T36.Y,
+; EG-NEXT: ALU clause starting at 65:
+; EG-NEXT: MOV T20.X, T36.Z,
+; EG-NEXT: MOV * T21.X, T36.W,
+; EG-NEXT: MOV T18.X, T36.X,
+; EG-NEXT: MOV * T19.X, T36.Y,
+; EG-NEXT: ALU clause starting at 69:
+; EG-NEXT: MOV T16.X, T36.Z,
+; EG-NEXT: MOV * T17.X, T36.W,
+; EG-NEXT: MOV T14.X, T36.X,
+; EG-NEXT: MOV * T15.X, T36.Y,
+; EG-NEXT: ALU clause starting at 73:
+; EG-NEXT: MOV T12.X, T36.Z,
+; EG-NEXT: MOV * T13.X, T36.W,
+; EG-NEXT: MOV T10.X, T36.X,
+; EG-NEXT: MOV * T11.X, T36.Y,
+; EG-NEXT: ALU clause starting at 77:
+; EG-NEXT: MOV T8.X, T36.Z,
+; EG-NEXT: MOV * T9.X, T36.W,
+; EG-NEXT: MOV T6.X, T36.X,
+; EG-NEXT: MOV * T7.X, T36.Y,
+; EG-NEXT: ALU clause starting at 81:
+; EG-NEXT: MOV T4.X, T35.Z,
+; EG-NEXT: MOV * T5.X, T35.W,
+; EG-NEXT: MOV T2.X, T35.X,
+; EG-NEXT: MOV * T3.X, T35.Y,
+; EG-NEXT: MOV T0.Y, T21.X,
+; EG-NEXT: MOV T0.Z, T22.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T23.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T24.X,
+; EG-NEXT: MOV T1.Z, T25.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T26.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T27.X,
+; EG-NEXT: MOV T2.Z, T28.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T29.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T30.X,
+; EG-NEXT: MOV T3.Z, T31.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T32.X, BS:VEC_201
+; EG-NEXT: MOV * T4.Y, T33.X,
+; EG-NEXT: LSHR * T35.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T35.Z, T36.W, literal.x,
+; EG-NEXT: AND_INT * T35.Z, T4.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T35.Y, T36.Z, literal.x,
-; EG-NEXT: LSHR * T36.W, T36.Y, literal.x,
+; EG-NEXT: LSHR T35.Y, T3.W, literal.x,
+; EG-NEXT: LSHR * T36.W, T3.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T35.X, T36.Z, literal.x,
-; EG-NEXT: AND_INT T36.Z, T36.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T35.X, T3.W, literal.x,
+; EG-NEXT: AND_INT T36.Z, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T41.X, PV.W, literal.x,
-; EG-NEXT: LSHR T36.Y, T36.X, literal.y,
-; EG-NEXT: LSHR T42.W, T40.W, literal.y,
-; EG-NEXT: AND_INT * T36.X, T36.X, literal.z,
+; EG-NEXT: LSHR T37.X, PV.W, literal.x,
+; EG-NEXT: LSHR T36.Y, T3.Y, literal.y,
+; EG-NEXT: LSHR T38.W, T2.W, literal.y,
+; EG-NEXT: AND_INT * T36.X, T3.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T42.Z, T40.W, literal.x,
+; EG-NEXT: AND_INT * T38.Z, T2.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T43.X, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR T42.Y, T40.Z, literal.y,
-; EG-NEXT: LSHR T40.W, T40.Y, literal.y,
-; EG-NEXT: AND_INT * T42.X, T40.Z, literal.z,
+; EG-NEXT: LSHR T39.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T38.Y, T2.Z, literal.y,
+; EG-NEXT: LSHR T40.W, T2.Y, literal.y,
+; EG-NEXT: AND_INT * T38.X, T2.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T40.Z, T40.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T40.Z, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT: LSHR T44.X, PV.W, literal.x,
-; EG-NEXT: LSHR T40.Y, T40.X, literal.y,
-; EG-NEXT: LSHR T45.W, T39.W, literal.y,
-; EG-NEXT: AND_INT * T40.X, T40.X, literal.z,
+; EG-NEXT: LSHR T41.X, PV.W, literal.x,
+; EG-NEXT: LSHR T40.Y, T1.W, literal.y,
+; EG-NEXT: LSHR T42.W, T1.Z, literal.y,
+; EG-NEXT: AND_INT * T40.X, T1.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T45.Z, T39.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T42.Z, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT: LSHR T46.X, PV.W, literal.x,
-; EG-NEXT: LSHR T45.Y, T39.Z, literal.y,
-; EG-NEXT: LSHR T39.W, T39.Y, literal.y,
-; EG-NEXT: AND_INT * T45.X, T39.Z, literal.z,
+; EG-NEXT: LSHR T43.X, PV.W, literal.x,
+; EG-NEXT: LSHR T42.Y, T1.Y, literal.y,
+; EG-NEXT: LSHR T44.W, T0.W, literal.y,
+; EG-NEXT: AND_INT * T42.X, T1.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T39.Z, T39.Y, literal.x,
+; EG-NEXT: AND_INT T44.Z, T0.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
-; EG-NEXT: LSHR T47.X, PV.W, literal.x,
-; EG-NEXT: LSHR T39.Y, T39.X, literal.y,
-; EG-NEXT: AND_INT * T39.X, T39.X, literal.z,
+; EG-NEXT: LSHR T45.X, PV.W, literal.x,
+; EG-NEXT: LSHR T44.Y, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T44.X, T0.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR * T37.W, T38.W, literal.y,
+; EG-NEXT: LSHR * T46.W, T0.Y, literal.y,
; EG-NEXT: 64(8.968310e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T48.X, PV.W, literal.x,
-; EG-NEXT: AND_INT * T37.Z, T38.W, literal.y,
+; EG-NEXT: LSHR T47.X, PV.W, literal.x,
+; EG-NEXT: AND_INT * T46.Z, T0.Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: ALU clause starting at 96:
-; EG-NEXT: LSHR T37.Y, T38.Z, literal.x,
-; EG-NEXT: LSHR * T38.W, T38.Y, literal.x,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T4.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T5.X,
+; EG-NEXT: MOV T1.Z, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T7.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T8.X,
+; EG-NEXT: MOV T2.Z, T9.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T10.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T11.X,
+; EG-NEXT: MOV T3.Z, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T13.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T14.X,
+; EG-NEXT: MOV T4.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T16.X, BS:VEC_201
+; EG-NEXT: MOV T5.Y, T17.X,
+; EG-NEXT: MOV T5.Z, T18.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T5.W, T19.X, BS:VEC_201
+; EG-NEXT: MOV * T6.Y, T20.X,
+; EG-NEXT: LSHR T46.Y, PV.Y, literal.x,
+; EG-NEXT: LSHR * T48.W, T5.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T37.X, T38.Z, literal.x,
-; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT * T46.X, T6.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 179:
+; EG-NEXT: AND_INT T48.Z, T5.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
-; EG-NEXT: LSHR T53.X, PV.W, literal.x,
-; EG-NEXT: LSHR T38.Y, T38.X, literal.y,
-; EG-NEXT: LSHR T54.W, T52.W, literal.y,
-; EG-NEXT: AND_INT * T38.X, T38.X, literal.z,
+; EG-NEXT: LSHR T49.X, PV.W, literal.x,
+; EG-NEXT: LSHR T48.Y, T5.Z, literal.y,
+; EG-NEXT: LSHR T50.W, T5.Y, literal.y,
+; EG-NEXT: AND_INT * T48.X, T5.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T54.Z, T52.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T50.Z, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT: LSHR T55.X, PV.W, literal.x,
-; EG-NEXT: LSHR T54.Y, T52.Z, literal.y,
-; EG-NEXT: LSHR T52.W, T52.Y, literal.y,
-; EG-NEXT: AND_INT * T54.X, T52.Z, literal.z,
+; EG-NEXT: LSHR T51.X, PV.W, literal.x,
+; EG-NEXT: LSHR T50.Y, T4.W, literal.y,
+; EG-NEXT: LSHR T52.W, T4.Z, literal.y,
+; EG-NEXT: AND_INT * T50.X, T4.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T52.Z, T52.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T52.Z, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
-; EG-NEXT: LSHR T56.X, PV.W, literal.x,
-; EG-NEXT: LSHR T52.Y, T52.X, literal.y,
-; EG-NEXT: LSHR T57.W, T51.W, literal.y,
-; EG-NEXT: AND_INT * T52.X, T52.X, literal.z,
+; EG-NEXT: LSHR T53.X, PV.W, literal.x,
+; EG-NEXT: LSHR T52.Y, T4.Y, literal.y,
+; EG-NEXT: LSHR T54.W, T3.W, literal.y,
+; EG-NEXT: AND_INT * T52.X, T4.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T57.Z, T51.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T54.Z, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
-; EG-NEXT: LSHR T58.X, PV.W, literal.x,
-; EG-NEXT: LSHR T57.Y, T51.Z, literal.y,
-; EG-NEXT: LSHR T51.W, T51.Y, literal.y,
-; EG-NEXT: AND_INT * T57.X, T51.Z, literal.z,
+; EG-NEXT: LSHR T55.X, PV.W, literal.x,
+; EG-NEXT: LSHR T54.Y, T3.Z, literal.y,
+; EG-NEXT: LSHR T56.W, T3.Y, literal.y,
+; EG-NEXT: AND_INT * T54.X, T3.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T51.Z, T51.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T56.Z, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
-; EG-NEXT: LSHR T59.X, PV.W, literal.x,
-; EG-NEXT: LSHR T51.Y, T51.X, literal.y,
-; EG-NEXT: LSHR T60.W, T50.W, literal.y,
-; EG-NEXT: AND_INT * T51.X, T51.X, literal.z,
+; EG-NEXT: LSHR T57.X, PV.W, literal.x,
+; EG-NEXT: LSHR T56.Y, T2.W, literal.y,
+; EG-NEXT: LSHR T58.W, T2.Z, literal.y,
+; EG-NEXT: AND_INT * T56.X, T2.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T60.Z, T50.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T58.Z, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
-; EG-NEXT: LSHR T61.X, PV.W, literal.x,
-; EG-NEXT: LSHR T60.Y, T50.Z, literal.y,
-; EG-NEXT: LSHR T50.W, T50.Y, literal.y,
-; EG-NEXT: AND_INT * T60.X, T50.Z, literal.z,
+; EG-NEXT: LSHR T59.X, PV.W, literal.x,
+; EG-NEXT: LSHR T58.Y, T2.Y, literal.y,
+; EG-NEXT: LSHR T60.W, T1.W, literal.y,
+; EG-NEXT: AND_INT * T58.X, T2.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T50.Z, T50.Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T60.Z, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
-; EG-NEXT: LSHR T62.X, PV.W, literal.x,
-; EG-NEXT: LSHR T50.Y, T50.X, literal.y,
-; EG-NEXT: LSHR T63.W, T49.W, literal.y,
-; EG-NEXT: AND_INT * T50.X, T50.X, literal.z,
+; EG-NEXT: LSHR T61.X, PV.W, literal.x,
+; EG-NEXT: LSHR T60.Y, T1.Z, literal.y,
+; EG-NEXT: LSHR T62.W, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T60.X, T1.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T63.Z, T49.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T62.Z, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
-; EG-NEXT: LSHR T64.X, PV.W, literal.x,
-; EG-NEXT: LSHR T63.Y, T49.Z, literal.y,
-; EG-NEXT: LSHR T49.W, T49.Y, literal.y,
-; EG-NEXT: AND_INT * T63.X, T49.Z, literal.z,
+; EG-NEXT: LSHR T63.X, PV.W, literal.x,
+; EG-NEXT: LSHR T62.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T64.W, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T62.X, T0.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T49.Z, T49.Y, literal.x,
+; EG-NEXT: AND_INT T64.Z, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
; EG-NEXT: LSHR T65.X, PV.W, literal.x,
-; EG-NEXT: LSHR T49.Y, T49.X, literal.y,
-; EG-NEXT: AND_INT * T49.X, T49.X, literal.z,
+; EG-NEXT: LSHR T64.Y, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T64.X, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
@@ -4085,168 +4493,259 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
;
; CM-LABEL: global_zextload_v64i16_to_v64i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 3 @22
-; CM-NEXT: ALU 50, @39, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 3 @30
-; CM-NEXT: ALU 78, @90, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T48.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T64.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T60, T49.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T59, T61.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T57, T50.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T56, T58.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T54, T51.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T53, T55.X
+; CM-NEXT: ALU 0, @52, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @36
+; CM-NEXT: ALU 3, @53, KC0[], KC1[]
+; CM-NEXT: TEX 0 @38
+; CM-NEXT: ALU 3, @57, KC0[], KC1[]
+; CM-NEXT: TEX 0 @40
+; CM-NEXT: ALU 3, @61, KC0[], KC1[]
+; CM-NEXT: TEX 0 @42
+; CM-NEXT: ALU 3, @65, KC0[], KC1[]
+; CM-NEXT: TEX 0 @44
+; CM-NEXT: ALU 3, @69, KC0[], KC1[]
+; CM-NEXT: TEX 0 @46
+; CM-NEXT: ALU 3, @73, KC0[], KC1[]
+; CM-NEXT: TEX 0 @48
+; CM-NEXT: ALU 3, @77, KC0[], KC1[]
+; CM-NEXT: TEX 0 @50
+; CM-NEXT: ALU 90, @81, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 74, @172, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T66.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T65.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T60, T63.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T58, T61.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T56, T59.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T54, T57.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T52, T55.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T50, T53.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T48, T51.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T46, T49.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T47.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T45.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T43.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T39.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T37.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T52.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T38.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T46.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T39.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T43.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T36.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 22:
-; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 112, #1
-; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 64, #1
-; CM-NEXT: VTX_READ_128 T38.XYZW, T35.X, 80, #1
-; CM-NEXT: VTX_READ_128 T39.XYZW, T35.X, 96, #1
-; CM-NEXT: Fetch clause starting at 30:
-; CM-NEXT: VTX_READ_128 T48.XYZW, T35.X, 0, #1
-; CM-NEXT: VTX_READ_128 T49.XYZW, T35.X, 16, #1
-; CM-NEXT: VTX_READ_128 T50.XYZW, T35.X, 32, #1
-; CM-NEXT: VTX_READ_128 T51.XYZW, T35.X, 48, #1
-; CM-NEXT: ALU clause starting at 38:
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 36:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; CM-NEXT: Fetch clause starting at 38:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; CM-NEXT: Fetch clause starting at 40:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 32, #1
+; CM-NEXT: Fetch clause starting at 42:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 48, #1
+; CM-NEXT: Fetch clause starting at 44:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 64, #1
+; CM-NEXT: Fetch clause starting at 46:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 80, #1
+; CM-NEXT: Fetch clause starting at 48:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; CM-NEXT: Fetch clause starting at 50:
+; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 112, #1
+; CM-NEXT: ALU clause starting at 52:
; CM-NEXT: MOV * T35.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 39:
-; CM-NEXT: LSHR * T40.W, T36.Y, literal.x,
+; CM-NEXT: ALU clause starting at 53:
+; CM-NEXT: MOV * T32.X, T36.Z,
+; CM-NEXT: MOV * T33.X, T36.W,
+; CM-NEXT: MOV * T30.X, T36.X,
+; CM-NEXT: MOV * T31.X, T36.Y,
+; CM-NEXT: ALU clause starting at 57:
+; CM-NEXT: MOV * T28.X, T36.Z,
+; CM-NEXT: MOV * T29.X, T36.W,
+; CM-NEXT: MOV * T26.X, T36.X,
+; CM-NEXT: MOV * T27.X, T36.Y,
+; CM-NEXT: ALU clause starting at 61:
+; CM-NEXT: MOV * T24.X, T36.Z,
+; CM-NEXT: MOV * T25.X, T36.W,
+; CM-NEXT: MOV * T22.X, T36.X,
+; CM-NEXT: MOV * T23.X, T36.Y,
+; CM-NEXT: ALU clause starting at 65:
+; CM-NEXT: MOV * T20.X, T36.Z,
+; CM-NEXT: MOV * T21.X, T36.W,
+; CM-NEXT: MOV * T18.X, T36.X,
+; CM-NEXT: MOV * T19.X, T36.Y,
+; CM-NEXT: ALU clause starting at 69:
+; CM-NEXT: MOV * T16.X, T36.Z,
+; CM-NEXT: MOV * T17.X, T36.W,
+; CM-NEXT: MOV * T14.X, T36.X,
+; CM-NEXT: MOV * T15.X, T36.Y,
+; CM-NEXT: ALU clause starting at 73:
+; CM-NEXT: MOV * T12.X, T36.Z,
+; CM-NEXT: MOV * T13.X, T36.W,
+; CM-NEXT: MOV * T10.X, T36.X,
+; CM-NEXT: MOV * T11.X, T36.Y,
+; CM-NEXT: ALU clause starting at 77:
+; CM-NEXT: MOV * T8.X, T36.Z,
+; CM-NEXT: MOV * T9.X, T36.W,
+; CM-NEXT: MOV * T6.X, T36.X,
+; CM-NEXT: MOV * T7.X, T36.Y,
+; CM-NEXT: ALU clause starting at 81:
+; CM-NEXT: MOV * T4.X, T35.Z,
+; CM-NEXT: MOV * T5.X, T35.W,
+; CM-NEXT: MOV * T2.X, T35.X,
+; CM-NEXT: MOV T3.X, T35.Y,
+; CM-NEXT: MOV T0.Y, T15.X,
+; CM-NEXT: MOV T0.Z, T12.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, T13.X, BS:VEC_201
+; CM-NEXT: MOV T1.Y, T10.X,
+; CM-NEXT: MOV T1.Z, T11.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T8.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T9.X,
+; CM-NEXT: MOV T2.Z, T6.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T7.X, BS:VEC_201
+; CM-NEXT: MOV T3.Y, T4.X,
+; CM-NEXT: MOV T3.Z, T5.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T3.W, T2.X, BS:VEC_201
+; CM-NEXT: MOV * T4.Y, T3.X,
+; CM-NEXT: LSHR * T35.W, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T40.Z, T36.Y, literal.x,
+; CM-NEXT: AND_INT * T35.Z, T4.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR T40.Y, T36.X, literal.x,
-; CM-NEXT: LSHR * T41.W, T36.W, literal.x,
+; CM-NEXT: LSHR T35.Y, T3.W, literal.x,
+; CM-NEXT: LSHR * T36.W, T3.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T40.X, T36.X, literal.x,
-; CM-NEXT: AND_INT T41.Z, T36.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T35.X, T3.W, literal.x,
+; CM-NEXT: AND_INT T36.Z, T3.Z, literal.x,
+; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
-; CM-NEXT: LSHR T36.X, PV.W, literal.x,
-; CM-NEXT: LSHR T41.Y, T36.Z, literal.y,
-; CM-NEXT: LSHR * T42.W, T39.Y, literal.y,
+; CM-NEXT: LSHR T37.X, PV.W, literal.x,
+; CM-NEXT: LSHR T36.Y, T3.Y, literal.y,
+; CM-NEXT: LSHR * T38.W, T2.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T41.X, T36.Z, literal.x,
-; CM-NEXT: AND_INT T42.Z, T39.Y, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T36.X, T3.Y, literal.x,
+; CM-NEXT: AND_INT T38.Z, T2.W, literal.x,
+; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
-; CM-NEXT: LSHR T43.X, PV.W, literal.x,
-; CM-NEXT: LSHR T42.Y, T39.X, literal.y,
-; CM-NEXT: LSHR * T44.W, T39.W, literal.y,
+; CM-NEXT: LSHR T39.X, PV.W, literal.x,
+; CM-NEXT: LSHR T38.Y, T2.Z, literal.y,
+; CM-NEXT: LSHR * T40.W, T2.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T42.X, T39.X, literal.x,
-; CM-NEXT: AND_INT T44.Z, T39.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T38.X, T2.Z, literal.x,
+; CM-NEXT: AND_INT T40.Z, T2.Y, literal.x,
+; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
-; CM-NEXT: LSHR T39.X, PV.W, literal.x,
-; CM-NEXT: LSHR T44.Y, T39.Z, literal.y,
-; CM-NEXT: LSHR * T45.W, T38.Y, literal.y,
+; CM-NEXT: LSHR T41.X, PV.W, literal.x,
+; CM-NEXT: LSHR T40.Y, T1.W, literal.y,
+; CM-NEXT: LSHR * T42.W, T1.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T44.X, T39.Z, literal.x,
-; CM-NEXT: AND_INT T45.Z, T38.Y, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: AND_INT T40.X, T1.W, literal.x,
+; CM-NEXT: AND_INT T42.Z, T1.Z, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
-; CM-NEXT: LSHR T46.X, PV.W, literal.x,
-; CM-NEXT: LSHR T45.Y, T38.X, literal.y,
-; CM-NEXT: LSHR * T47.W, T38.W, literal.y,
+; CM-NEXT: LSHR T43.X, PV.W, literal.x,
+; CM-NEXT: LSHR T42.Y, T1.Y, literal.y,
+; CM-NEXT: LSHR * T44.W, T0.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T45.X, T38.X, literal.x,
-; CM-NEXT: AND_INT T47.Z, T38.W, literal.x,
+; CM-NEXT: AND_INT T42.X, T1.Y, literal.x,
+; CM-NEXT: AND_INT T44.Z, T0.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
-; CM-NEXT: LSHR T38.X, PV.W, literal.x,
-; CM-NEXT: LSHR T47.Y, T38.Z, literal.y,
-; CM-NEXT: LSHR * T35.W, T37.Y, literal.y,
+; CM-NEXT: LSHR T45.X, PV.W, literal.x,
+; CM-NEXT: LSHR T44.Y, T0.Z, literal.y,
+; CM-NEXT: LSHR * T46.W, T0.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T47.X, T38.Z, literal.x,
-; CM-NEXT: AND_INT T35.Z, T37.Y, literal.x,
+; CM-NEXT: AND_INT T44.X, T0.Z, literal.x,
+; CM-NEXT: AND_INT T46.Z, T0.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
-; CM-NEXT: ALU clause starting at 90:
-; CM-NEXT: LSHR T52.X, T0.W, literal.x,
-; CM-NEXT: LSHR T35.Y, T37.X, literal.y,
-; CM-NEXT: LSHR * T53.W, T37.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MOV T0.Y, T32.X,
+; CM-NEXT: MOV * T0.Z, T33.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV T1.Y, T30.X,
+; CM-NEXT: MOV T1.Z, T31.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T28.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T29.X,
+; CM-NEXT: MOV T2.Z, T26.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T27.X, BS:VEC_201
+; CM-NEXT: MOV T3.Y, T24.X,
+; CM-NEXT: MOV T3.Z, T25.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T3.W, T22.X, BS:VEC_201
+; CM-NEXT: MOV T4.Y, T23.X,
+; CM-NEXT: MOV T4.Z, T20.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T4.W, T21.X, BS:VEC_201
+; CM-NEXT: MOV T5.Y, T18.X,
+; CM-NEXT: MOV T5.Z, T19.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T5.W, T16.X, BS:VEC_201
+; CM-NEXT: MOV T6.Y, T17.X,
+; CM-NEXT: MOV * T6.Z, T14.X, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR T47.X, T0.W, literal.x,
+; CM-NEXT: LSHR T46.Y, PV.Z, literal.y,
+; CM-NEXT: LSHR * T48.W, PV.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T35.X, T37.X, literal.x,
-; CM-NEXT: AND_INT T53.Z, T37.W, literal.x,
+; CM-NEXT: ALU clause starting at 172:
+; CM-NEXT: AND_INT T46.X, T6.Z, literal.x,
+; CM-NEXT: AND_INT T48.Z, T6.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
-; CM-NEXT: LSHR T37.X, PV.W, literal.x,
-; CM-NEXT: LSHR T53.Y, T37.Z, literal.y,
-; CM-NEXT: LSHR * T54.W, T51.Y, literal.y,
+; CM-NEXT: LSHR T49.X, PV.W, literal.x,
+; CM-NEXT: LSHR T48.Y, T5.W, literal.y,
+; CM-NEXT: LSHR * T50.W, T5.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T53.X, T37.Z, literal.x,
-; CM-NEXT: AND_INT T54.Z, T51.Y, literal.x,
+; CM-NEXT: AND_INT T48.X, T5.W, literal.x,
+; CM-NEXT: AND_INT T50.Z, T5.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
-; CM-NEXT: LSHR T55.X, PV.W, literal.x,
-; CM-NEXT: LSHR T54.Y, T51.X, literal.y,
-; CM-NEXT: LSHR * T56.W, T51.W, literal.y,
+; CM-NEXT: LSHR T51.X, PV.W, literal.x,
+; CM-NEXT: LSHR T50.Y, T5.Y, literal.y,
+; CM-NEXT: LSHR * T52.W, T4.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T54.X, T51.X, literal.x,
-; CM-NEXT: AND_INT T56.Z, T51.W, literal.x,
+; CM-NEXT: AND_INT T50.X, T5.Y, literal.x,
+; CM-NEXT: AND_INT T52.Z, T4.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
-; CM-NEXT: LSHR T51.X, PV.W, literal.x,
-; CM-NEXT: LSHR T56.Y, T51.Z, literal.y,
-; CM-NEXT: LSHR * T57.W, T50.Y, literal.y,
+; CM-NEXT: LSHR T53.X, PV.W, literal.x,
+; CM-NEXT: LSHR T52.Y, T4.Z, literal.y,
+; CM-NEXT: LSHR * T54.W, T4.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T56.X, T51.Z, literal.x,
-; CM-NEXT: AND_INT T57.Z, T50.Y, literal.x,
+; CM-NEXT: AND_INT T52.X, T4.Z, literal.x,
+; CM-NEXT: AND_INT T54.Z, T4.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
-; CM-NEXT: LSHR T58.X, PV.W, literal.x,
-; CM-NEXT: LSHR T57.Y, T50.X, literal.y,
-; CM-NEXT: LSHR * T59.W, T50.W, literal.y,
+; CM-NEXT: LSHR T55.X, PV.W, literal.x,
+; CM-NEXT: LSHR T54.Y, T3.W, literal.y,
+; CM-NEXT: LSHR * T56.W, T3.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T57.X, T50.X, literal.x,
-; CM-NEXT: AND_INT T59.Z, T50.W, literal.x,
+; CM-NEXT: AND_INT T54.X, T3.W, literal.x,
+; CM-NEXT: AND_INT T56.Z, T3.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
-; CM-NEXT: LSHR T50.X, PV.W, literal.x,
-; CM-NEXT: LSHR T59.Y, T50.Z, literal.y,
-; CM-NEXT: LSHR * T60.W, T49.Y, literal.y,
+; CM-NEXT: LSHR T57.X, PV.W, literal.x,
+; CM-NEXT: LSHR T56.Y, T3.Y, literal.y,
+; CM-NEXT: LSHR * T58.W, T2.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T59.X, T50.Z, literal.x,
-; CM-NEXT: AND_INT T60.Z, T49.Y, literal.x,
+; CM-NEXT: AND_INT T56.X, T3.Y, literal.x,
+; CM-NEXT: AND_INT T58.Z, T2.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
-; CM-NEXT: LSHR T61.X, PV.W, literal.x,
-; CM-NEXT: LSHR T60.Y, T49.X, literal.y,
-; CM-NEXT: LSHR * T62.W, T49.W, literal.y,
+; CM-NEXT: LSHR T59.X, PV.W, literal.x,
+; CM-NEXT: LSHR T58.Y, T2.Z, literal.y,
+; CM-NEXT: LSHR * T60.W, T2.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T60.X, T49.X, literal.x,
-; CM-NEXT: AND_INT T62.Z, T49.W, literal.x,
+; CM-NEXT: AND_INT T58.X, T2.Z, literal.x,
+; CM-NEXT: AND_INT T60.Z, T2.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
-; CM-NEXT: LSHR T49.X, PV.W, literal.x,
-; CM-NEXT: LSHR T62.Y, T49.Z, literal.y,
-; CM-NEXT: LSHR * T63.W, T48.Y, literal.y,
+; CM-NEXT: LSHR T61.X, PV.W, literal.x,
+; CM-NEXT: LSHR T60.Y, T1.W, literal.y,
+; CM-NEXT: LSHR * T62.W, T1.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T62.X, T49.Z, literal.x,
-; CM-NEXT: AND_INT T63.Z, T48.Y, literal.x,
+; CM-NEXT: AND_INT T60.X, T1.W, literal.x,
+; CM-NEXT: AND_INT T62.Z, T1.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
-; CM-NEXT: LSHR T64.X, PV.W, literal.x,
-; CM-NEXT: LSHR T63.Y, T48.X, literal.y,
-; CM-NEXT: LSHR * T65.W, T48.W, literal.y,
+; CM-NEXT: LSHR T63.X, PV.W, literal.x,
+; CM-NEXT: LSHR T62.Y, T1.Y, literal.y,
+; CM-NEXT: LSHR * T64.W, T0.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T63.X, T48.X, literal.x,
-; CM-NEXT: AND_INT * T65.Z, T48.W, literal.x,
+; CM-NEXT: AND_INT T62.X, T1.Y, literal.x,
+; CM-NEXT: AND_INT * T64.Z, T0.Z, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR T48.X, KC0[2].Y, literal.x,
-; CM-NEXT: LSHR * T65.Y, T48.Z, literal.y,
+; CM-NEXT: LSHR T65.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T64.Y, T0.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: AND_INT T65.X, T48.Z, literal.x,
+; CM-NEXT: AND_INT T64.X, T0.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR * T66.X, PV.W, literal.x,
@@ -4702,420 +5201,605 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
;
; EG-LABEL: global_sextload_v64i16_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 18, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 7 @22
-; EG-NEXT: ALU 75, @57, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 71, @133, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T66.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T56.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T55.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T54.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T53.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T52.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T51.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T35.X, 1
+; EG-NEXT: ALU 0, @52, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @36
+; EG-NEXT: ALU 3, @53, KC0[], KC1[]
+; EG-NEXT: TEX 0 @38
+; EG-NEXT: ALU 3, @57, KC0[], KC1[]
+; EG-NEXT: TEX 0 @40
+; EG-NEXT: ALU 3, @61, KC0[], KC1[]
+; EG-NEXT: TEX 0 @42
+; EG-NEXT: ALU 3, @65, KC0[], KC1[]
+; EG-NEXT: TEX 0 @44
+; EG-NEXT: ALU 3, @69, KC0[], KC1[]
+; EG-NEXT: TEX 0 @46
+; EG-NEXT: ALU 3, @73, KC0[], KC1[]
+; EG-NEXT: TEX 0 @48
+; EG-NEXT: ALU 3, @77, KC0[], KC1[]
+; EG-NEXT: TEX 0 @50
+; EG-NEXT: ALU 91, @81, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 74, @173, KC0[], KC1[]
+; EG-NEXT: ALU 36, @248, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T66.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T65.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T48.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T46.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T44.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T42.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T55.XYZW, T35.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T40.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T38.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T42.XYZW, T41.X, 16, #1
-; EG-NEXT: VTX_READ_128 T43.XYZW, T41.X, 32, #1
-; EG-NEXT: VTX_READ_128 T44.XYZW, T41.X, 0, #1
-; EG-NEXT: VTX_READ_128 T45.XYZW, T41.X, 48, #1
-; EG-NEXT: VTX_READ_128 T46.XYZW, T41.X, 64, #1
-; EG-NEXT: VTX_READ_128 T47.XYZW, T41.X, 80, #1
-; EG-NEXT: VTX_READ_128 T48.XYZW, T41.X, 96, #1
-; EG-NEXT: VTX_READ_128 T41.XYZW, T41.X, 112, #1
-; EG-NEXT: ALU clause starting at 38:
+; EG-NEXT: Fetch clause starting at 36:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; EG-NEXT: Fetch clause starting at 38:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; EG-NEXT: Fetch clause starting at 40:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 32, #1
+; EG-NEXT: Fetch clause starting at 42:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 48, #1
+; EG-NEXT: Fetch clause starting at 44:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 64, #1
+; EG-NEXT: Fetch clause starting at 46:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 80, #1
+; EG-NEXT: Fetch clause starting at 48:
+; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; EG-NEXT: Fetch clause starting at 50:
+; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 112, #1
+; EG-NEXT: ALU clause starting at 52:
+; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 53:
+; EG-NEXT: MOV T32.X, T36.Z,
+; EG-NEXT: MOV * T33.X, T36.W,
+; EG-NEXT: MOV T30.X, T36.X,
+; EG-NEXT: MOV * T31.X, T36.Y,
+; EG-NEXT: ALU clause starting at 57:
+; EG-NEXT: MOV T28.X, T36.Z,
+; EG-NEXT: MOV * T29.X, T36.W,
+; EG-NEXT: MOV T26.X, T36.X,
+; EG-NEXT: MOV * T27.X, T36.Y,
+; EG-NEXT: ALU clause starting at 61:
+; EG-NEXT: MOV T24.X, T36.Z,
+; EG-NEXT: MOV * T25.X, T36.W,
+; EG-NEXT: MOV T22.X, T36.X,
+; EG-NEXT: MOV * T23.X, T36.Y,
+; EG-NEXT: ALU clause starting at 65:
+; EG-NEXT: MOV T20.X, T36.Z,
+; EG-NEXT: MOV * T21.X, T36.W,
+; EG-NEXT: MOV T18.X, T36.X,
+; EG-NEXT: MOV * T19.X, T36.Y,
+; EG-NEXT: ALU clause starting at 69:
+; EG-NEXT: MOV T16.X, T36.Z,
+; EG-NEXT: MOV * T17.X, T36.W,
+; EG-NEXT: MOV T14.X, T36.X,
+; EG-NEXT: MOV * T15.X, T36.Y,
+; EG-NEXT: ALU clause starting at 73:
+; EG-NEXT: MOV T12.X, T36.Z,
+; EG-NEXT: MOV * T13.X, T36.W,
+; EG-NEXT: MOV T10.X, T36.X,
+; EG-NEXT: MOV * T11.X, T36.Y,
+; EG-NEXT: ALU clause starting at 77:
+; EG-NEXT: MOV T8.X, T36.Z,
+; EG-NEXT: MOV * T9.X, T36.W,
+; EG-NEXT: MOV T6.X, T36.X,
+; EG-NEXT: MOV * T7.X, T36.Y,
+; EG-NEXT: ALU clause starting at 81:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T35.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T36.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T37.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T37.X, PV.W, literal.x,
+; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T38.X, PV.W, literal.x,
+; EG-NEXT: LSHR T39.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
+; EG-NEXT: LSHR T40.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T40.X, PV.W, literal.x,
-; EG-NEXT: MOV * T41.X, KC0[2].Z,
+; EG-NEXT: LSHR T41.X, PV.W, literal.x,
+; EG-NEXT: MOV * T4.X, T35.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 57:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T5.X, T35.W,
+; EG-NEXT: MOV * T2.X, T35.X,
+; EG-NEXT: MOV T3.X, T35.Y,
+; EG-NEXT: MOV T0.Y, T22.X,
+; EG-NEXT: MOV T0.Z, T23.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T24.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T26.X,
+; EG-NEXT: MOV T1.Z, T28.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T30.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T29.X,
+; EG-NEXT: MOV T2.Z, T31.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T27.X, BS:VEC_201
+; EG-NEXT: MOV T3.Y, T32.X,
+; EG-NEXT: MOV T3.Z, T25.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T21.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T33.X,
+; EG-NEXT: MOV T4.Z, T19.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T4.W, T17.X, BS:VEC_201
+; EG-NEXT: MOV T5.Y, T15.X,
+; EG-NEXT: MOV T5.Z, T13.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T5.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T6.Y, T9.X,
+; EG-NEXT: MOV T6.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T6.W, T5.X, BS:VEC_201
+; EG-NEXT: MOV T7.Y, T3.X,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T49.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T35.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: LSHR T50.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T42.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT: LSHR T51.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T43.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT: LSHR T52.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T44.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT: LSHR T53.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.Y, T41.Y, literal.y,
-; EG-NEXT: LSHR T0.Z, T41.W, literal.y,
-; EG-NEXT: LSHR T0.W, T48.Y, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T45.X, PV.W, literal.x,
+; EG-NEXT: LSHR T7.Z, T7.Y, literal.y,
+; EG-NEXT: LSHR T7.W, T6.W, literal.y,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T54.X, PS, literal.x,
-; EG-NEXT: LSHR T1.Y, T48.W, literal.y,
-; EG-NEXT: LSHR T1.Z, T47.Y, literal.y,
-; EG-NEXT: LSHR T1.W, T47.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T46.X, PS, literal.x,
+; EG-NEXT: LSHR T8.Y, T6.Z, literal.y,
+; EG-NEXT: LSHR T8.Z, T6.Y, literal.y,
+; EG-NEXT: LSHR T8.W, T5.W, literal.y,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T55.X, PS, literal.x,
-; EG-NEXT: LSHR T2.Y, T46.Y, literal.y,
-; EG-NEXT: LSHR T2.Z, T46.W, literal.y,
-; EG-NEXT: LSHR T2.W, T45.Y, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT: LSHR T47.X, PS, literal.x,
+; EG-NEXT: LSHR T9.Y, T5.Z, literal.y,
+; EG-NEXT: LSHR T9.Z, T5.Y, literal.y,
+; EG-NEXT: LSHR T9.W, T4.W, literal.y,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T56.X, PS, literal.x,
-; EG-NEXT: LSHR T3.Y, T45.W, literal.y,
-; EG-NEXT: BFE_INT T57.Z, T44.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T3.W, T43.Y, literal.y,
-; EG-NEXT: LSHR * T4.W, T43.W, literal.y,
+; EG-NEXT: LSHR T48.X, PS, literal.x,
+; EG-NEXT: LSHR T10.Y, T4.Z, literal.y,
+; EG-NEXT: BFE_INT T49.Z, T4.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T10.W, T3.W, literal.y,
+; EG-NEXT: LSHR * T11.W, T3.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T57.X, T44.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T4.Y, T42.Y, literal.x,
-; EG-NEXT: BFE_INT T58.Z, T44.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T5.W, T42.W, literal.x,
-; EG-NEXT: LSHR * T6.W, T44.W, literal.x,
+; EG-NEXT: BFE_INT T49.X, T3.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T11.Y, T2.W, literal.x,
+; EG-NEXT: BFE_INT T50.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T12.W, T2.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T13.W, T4.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T58.X, T44.X, 0.0, literal.x,
-; EG-NEXT: LSHR T5.Y, T44.Y, literal.x,
-; EG-NEXT: BFE_INT T59.Z, T42.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T57.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T44.Z, literal.x,
+; EG-NEXT: BFE_INT * T50.X, T1.W, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T59.X, T42.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T57.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Z, T42.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T58.W, PV.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T44.X, literal.x,
+; EG-NEXT: ALU clause starting at 173:
+; EG-NEXT: LSHR T4.Y, T2.Z, literal.x,
+; EG-NEXT: BFE_INT T51.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T49.W, T13.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T13.W, T3.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T44.X, T42.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T58.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T60.Z, T43.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T59.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T5.W, T42.Z, literal.x,
+; EG-NEXT: BFE_INT T51.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T49.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T52.Z, T2.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T50.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T60.X, T43.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T59.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Z, T43.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T5.W, T42.X, literal.x,
+; EG-NEXT: BFE_INT T52.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T50.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.Z, T3.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T51.W, T12.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T42.X, T43.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T61.Z, T45.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T60.W, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T53.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T51.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T54.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T52.W, T11.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 133:
-; EG-NEXT: LSHR * T4.W, T43.Z, literal.x,
+; EG-NEXT: BFE_INT T54.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T52.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T55.Z, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.W, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T61.X, T45.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T60.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Z, T45.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.W, T3.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T43.X, literal.x,
+; EG-NEXT: MOV * T0.Z, T2.X,
+; EG-NEXT: MOV T1.Y, T4.X,
+; EG-NEXT: MOV T1.Z, T6.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.Y, T8.X,
+; EG-NEXT: MOV T2.Z, T10.X,
+; EG-NEXT: MOV * T2.W, T12.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T3.Y, T14.X,
+; EG-NEXT: MOV T3.Z, T16.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T3.W, T18.X, BS:VEC_201
+; EG-NEXT: MOV T4.Y, T20.X,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T43.X, T45.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T62.Z, T46.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T61.W, T3.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T45.Z, literal.x,
+; EG-NEXT: BFE_INT T55.X, PV.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T53.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T56.Z, T4.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T54.W, T1.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T62.X, T46.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T61.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Z, T46.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.W, T2.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T2.W, T45.X, literal.x,
+; EG-NEXT: BFE_INT T56.X, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T54.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T57.Z, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T55.W, T10.W, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T0.W, T4.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T45.X, T46.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T63.Z, T47.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T62.W, T2.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T2.W, T46.Z, literal.x,
+; EG-NEXT: BFE_INT T57.X, T3.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T55.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T58.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T56.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T0.W, T3.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T63.X, T47.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T62.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Z, T47.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T2.W, T46.X, literal.x,
+; EG-NEXT: BFE_INT T58.X, T3.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T56.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T59.Z, T5.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T57.W, T9.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T3.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T59.X, T2.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T57.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T60.Z, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T58.W, T9.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T3.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T60.X, T2.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T58.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T61.Z, T6.Y, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T46.X, T47.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T64.Z, T48.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T63.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T1.W, T47.Z, literal.x,
+; EG-NEXT: ALU clause starting at 248:
+; EG-NEXT: BFE_INT T59.W, T9.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T2.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T64.X, T48.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T63.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T47.Z, T48.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T1.W, T47.X, literal.x,
+; EG-NEXT: BFE_INT T61.X, T2.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T59.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T62.Z, T6.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T60.W, T8.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T2.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T47.X, T48.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T65.Z, T41.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T64.W, T1.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T48.Z, literal.x,
+; EG-NEXT: BFE_INT T62.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T60.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T63.Z, T6.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T61.W, T8.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T0.W, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T65.X, T41.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T64.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T48.Z, T41.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T47.W, T0.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T48.X, literal.x,
+; EG-NEXT: BFE_INT T63.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T61.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T64.Z, T7.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T62.W, T8.Y, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: LSHR * T0.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T48.X, T41.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T47.Y, PS, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Z, T41.Z, literal.x,
-; EG-NEXT: BFE_INT T65.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T64.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T62.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Z, T1.Y, literal.x,
+; EG-NEXT: BFE_INT T63.W, T7.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43)
-; EG-NEXT: LSHR T66.X, PS, literal.x,
-; EG-NEXT: BFE_INT T65.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.Z, T41.X, literal.y,
-; EG-NEXT: BFE_INT T48.W, T0.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T65.X, PS, literal.x,
+; EG-NEXT: BFE_INT T63.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T0.Z, literal.y,
+; EG-NEXT: BFE_INT T64.W, T7.Z, 0.0, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T48.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T66.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T64.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v64i16_to_v64i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @40, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 1 @24
-; CM-NEXT: ALU 15, @41, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 5 @28
-; CM-NEXT: ALU 82, @57, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 72, @140, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T35.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T56.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T55.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T54.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T53.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T52.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T51.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T61, T50.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T49.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T60, T48.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T47.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T59, T46.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T40.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T58, T39.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T57, T38.X
+; CM-NEXT: ALU 0, @52, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @36
+; CM-NEXT: ALU 3, @53, KC0[], KC1[]
+; CM-NEXT: TEX 0 @38
+; CM-NEXT: ALU 3, @57, KC0[], KC1[]
+; CM-NEXT: TEX 0 @40
+; CM-NEXT: ALU 3, @61, KC0[], KC1[]
+; CM-NEXT: TEX 0 @42
+; CM-NEXT: ALU 3, @65, KC0[], KC1[]
+; CM-NEXT: TEX 0 @44
+; CM-NEXT: ALU 3, @69, KC0[], KC1[]
+; CM-NEXT: TEX 0 @46
+; CM-NEXT: ALU 3, @73, KC0[], KC1[]
+; CM-NEXT: TEX 0 @48
+; CM-NEXT: ALU 3, @77, KC0[], KC1[]
+; CM-NEXT: TEX 0 @50
+; CM-NEXT: ALU 95, @81, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 76, @177, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 35, @254, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T66.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T65.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T48.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T61, T47.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T60, T46.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T59, T45.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T58, T44.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T57, T43.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T56, T42.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T55, T41.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T54, T40.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T53, T39.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T52, T38.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T51, T37.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T50, T36.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T49, T35.X
; CM-NEXT: CF_END
-; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 24:
-; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 16, #1
-; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 0, #1
-; CM-NEXT: Fetch clause starting at 28:
-; CM-NEXT: VTX_READ_128 T41.XYZW, T37.X, 112, #1
-; CM-NEXT: VTX_READ_128 T42.XYZW, T37.X, 96, #1
-; CM-NEXT: VTX_READ_128 T43.XYZW, T37.X, 80, #1
-; CM-NEXT: VTX_READ_128 T44.XYZW, T37.X, 64, #1
-; CM-NEXT: VTX_READ_128 T45.XYZW, T37.X, 48, #1
-; CM-NEXT: VTX_READ_128 T37.XYZW, T37.X, 32, #1
-; CM-NEXT: ALU clause starting at 40:
-; CM-NEXT: MOV * T37.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 41:
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: Fetch clause starting at 36:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
+; CM-NEXT: Fetch clause starting at 38:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; CM-NEXT: Fetch clause starting at 40:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 32, #1
+; CM-NEXT: Fetch clause starting at 42:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 48, #1
+; CM-NEXT: Fetch clause starting at 44:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 64, #1
+; CM-NEXT: Fetch clause starting at 46:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 80, #1
+; CM-NEXT: Fetch clause starting at 48:
+; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; CM-NEXT: Fetch clause starting at 50:
+; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 112, #1
+; CM-NEXT: ALU clause starting at 52:
+; CM-NEXT: MOV * T35.X, KC0[2].Z,
+; CM-NEXT: ALU clause starting at 53:
+; CM-NEXT: MOV * T32.X, T36.Z,
+; CM-NEXT: MOV * T33.X, T36.W,
+; CM-NEXT: MOV * T30.X, T36.X,
+; CM-NEXT: MOV * T31.X, T36.Y,
+; CM-NEXT: ALU clause starting at 57:
+; CM-NEXT: MOV * T28.X, T36.Z,
+; CM-NEXT: MOV * T29.X, T36.W,
+; CM-NEXT: MOV * T26.X, T36.X,
+; CM-NEXT: MOV * T27.X, T36.Y,
+; CM-NEXT: ALU clause starting at 61:
+; CM-NEXT: MOV * T24.X, T36.Z,
+; CM-NEXT: MOV * T25.X, T36.W,
+; CM-NEXT: MOV * T22.X, T36.X,
+; CM-NEXT: MOV * T23.X, T36.Y,
+; CM-NEXT: ALU clause starting at 65:
+; CM-NEXT: MOV * T20.X, T36.Z,
+; CM-NEXT: MOV * T21.X, T36.W,
+; CM-NEXT: MOV * T18.X, T36.X,
+; CM-NEXT: MOV * T19.X, T36.Y,
+; CM-NEXT: ALU clause starting at 69:
+; CM-NEXT: MOV * T16.X, T36.Z,
+; CM-NEXT: MOV * T17.X, T36.W,
+; CM-NEXT: MOV * T14.X, T36.X,
+; CM-NEXT: MOV * T15.X, T36.Y,
+; CM-NEXT: ALU clause starting at 73:
+; CM-NEXT: MOV * T12.X, T36.Z,
+; CM-NEXT: MOV * T13.X, T36.W,
+; CM-NEXT: MOV * T10.X, T36.X,
+; CM-NEXT: MOV * T11.X, T36.Y,
+; CM-NEXT: ALU clause starting at 77:
+; CM-NEXT: MOV * T8.X, T36.Z,
+; CM-NEXT: MOV * T9.X, T36.W,
+; CM-NEXT: MOV * T6.X, T36.X,
+; CM-NEXT: MOV * T7.X, T36.Y,
+; CM-NEXT: ALU clause starting at 81:
+; CM-NEXT: MOV * T4.X, T35.Z,
+; CM-NEXT: MOV * T5.X, T35.W,
+; CM-NEXT: MOV * T2.X, T35.X,
+; CM-NEXT: MOV T3.X, T35.Y,
+; CM-NEXT: MOV T0.Y, T29.X,
+; CM-NEXT: MOV T0.Z, T28.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, T31.X, BS:VEC_201
+; CM-NEXT: MOV T1.Y, T33.X,
+; CM-NEXT: MOV T1.Z, T32.X, BS:VEC_120/SCL_212
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T38.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: LSHR T35.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
-; CM-NEXT: LSHR T39.X, PV.W, literal.x,
-; CM-NEXT: LSHR T0.Y, T35.Z, literal.y,
-; CM-NEXT: LSHR T0.Z, T35.W, literal.y,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T36.X, PV.W, literal.x,
+; CM-NEXT: LSHR T2.Y, T1.Z, literal.y,
+; CM-NEXT: LSHR T2.Z, T1.Y, literal.y,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T40.X, PV.W, literal.x,
-; CM-NEXT: LSHR T1.Y, T35.Y, literal.y,
-; CM-NEXT: LSHR T1.Z, T36.Z, literal.y,
-; CM-NEXT: LSHR * T0.W, T36.W, literal.y,
+; CM-NEXT: LSHR T37.X, PV.W, literal.x,
+; CM-NEXT: LSHR T3.Y, T0.W, literal.y,
+; CM-NEXT: LSHR T3.Z, T0.Z, literal.y,
+; CM-NEXT: LSHR * T1.W, T0.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: ALU clause starting at 57:
-; CM-NEXT: LSHR T2.Z, T36.X, literal.x,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; CM-NEXT: MOV * T2.W, T5.X,
+; CM-NEXT: MOV * T3.W, T2.X,
+; CM-NEXT: MOV T4.Y, T4.X,
+; CM-NEXT: MOV T4.Z, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T4.W, T7.X, BS:VEC_201
+; CM-NEXT: MOV T5.Y, T6.X,
+; CM-NEXT: MOV T5.Z, T9.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T5.W, T8.X, BS:VEC_201
+; CM-NEXT: MOV T6.Y, T11.X,
+; CM-NEXT: MOV T6.Z, T10.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T6.W, T13.X, BS:VEC_201
+; CM-NEXT: MOV T7.Y, T12.X,
+; CM-NEXT: MOV T7.Z, T15.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T7.W, T14.X, BS:VEC_201
+; CM-NEXT: MOV T8.Y, T17.X,
+; CM-NEXT: MOV T8.Z, T16.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T8.W, T19.X, BS:VEC_201
+; CM-NEXT: MOV T9.Y, T18.X,
+; CM-NEXT: MOV T9.Z, T21.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T9.W, T20.X, BS:VEC_201
+; CM-NEXT: MOV T10.Y, T23.X,
+; CM-NEXT: MOV T10.Z, T22.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T10.W, T25.X, BS:VEC_201
+; CM-NEXT: MOV T11.Y, T24.X,
+; CM-NEXT: MOV T11.Z, T27.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T11.W, T26.X, BS:VEC_201
+; CM-NEXT: LSHR T12.Z, PV.W, literal.x,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 208(2.914701e-43)
-; CM-NEXT: LSHR T46.X, PV.W, literal.x,
-; CM-NEXT: LSHR T2.Y, T36.Y, literal.y,
-; CM-NEXT: LSHR T3.Z, T37.Z, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T38.X, PV.W, literal.x,
+; CM-NEXT: LSHR T12.Y, T11.Z, literal.y,
+; CM-NEXT: LSHR T13.Z, T11.Y, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 160(2.242078e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T47.X, PV.W, literal.x,
-; CM-NEXT: LSHR T3.Y, T37.W, literal.y,
-; CM-NEXT: LSHR T4.Z, T37.X, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T39.X, PV.W, literal.x,
+; CM-NEXT: LSHR T13.Y, T10.W, literal.y,
+; CM-NEXT: LSHR T14.Z, T10.Z, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 176(2.466285e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T48.X, PV.W, literal.x,
-; CM-NEXT: LSHR T4.Y, T37.Y, literal.y,
-; CM-NEXT: LSHR T5.Z, T45.Z, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T40.X, PV.W, literal.x,
+; CM-NEXT: LSHR T14.Y, T10.Y, literal.y,
+; CM-NEXT: LSHR T15.Z, T9.W, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 128(1.793662e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T49.X, PV.W, literal.x,
-; CM-NEXT: LSHR T5.Y, T45.W, literal.y,
-; CM-NEXT: LSHR T6.Z, T45.X, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T41.X, PV.W, literal.x,
+; CM-NEXT: LSHR T15.Y, T9.Z, literal.y,
+; CM-NEXT: LSHR T16.Z, T9.Y, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 144(2.017870e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T50.X, PV.W, literal.x,
-; CM-NEXT: LSHR T6.Y, T45.Y, literal.y,
-; CM-NEXT: LSHR T7.Z, T44.Z, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T42.X, PV.W, literal.x,
+; CM-NEXT: LSHR T16.Y, T8.W, literal.y,
+; CM-NEXT: LSHR T17.Z, T8.Z, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T51.X, PV.W, literal.x,
-; CM-NEXT: LSHR T7.Y, T44.W, literal.y,
-; CM-NEXT: LSHR T8.Z, T44.X, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T43.X, PV.W, literal.x,
+; CM-NEXT: LSHR T17.Y, T8.Y, literal.y,
+; CM-NEXT: LSHR T18.Z, T7.W, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T52.X, PV.W, literal.x,
-; CM-NEXT: LSHR T8.Y, T44.Y, literal.y,
-; CM-NEXT: LSHR T9.Z, T43.Z, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T44.X, PV.W, literal.x,
+; CM-NEXT: LSHR T18.Y, T7.Z, literal.y,
+; CM-NEXT: LSHR T19.Z, T7.Y, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T53.X, PV.W, literal.x,
-; CM-NEXT: LSHR T9.Y, T43.W, literal.y,
-; CM-NEXT: LSHR T10.Z, T43.X, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: ALU clause starting at 177:
+; CM-NEXT: LSHR T45.X, T12.W, literal.x,
+; CM-NEXT: LSHR T19.Y, T6.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR T20.Z, T6.Z, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T54.X, PV.W, literal.x,
-; CM-NEXT: LSHR T10.Y, T43.Y, literal.y,
-; CM-NEXT: LSHR T11.Z, T42.Z, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T46.X, PV.W, literal.x,
+; CM-NEXT: LSHR T20.Y, T6.Y, literal.y,
+; CM-NEXT: LSHR T21.Z, T5.W, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T55.X, PV.W, literal.x,
-; CM-NEXT: LSHR T11.Y, T42.W, literal.y,
-; CM-NEXT: LSHR T12.Z, T42.X, literal.y,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: LSHR T47.X, PV.W, literal.x,
+; CM-NEXT: LSHR T21.Y, T5.Z, literal.y,
+; CM-NEXT: LSHR T22.Z, T5.Y, literal.y,
+; CM-NEXT: ADD_INT * T12.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T56.X, PV.W, literal.x,
-; CM-NEXT: LSHR T12.Y, T42.Y, literal.y,
-; CM-NEXT: BFE_INT T57.Z, T41.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: LSHR * T1.W, T41.Z, literal.y,
+; CM-NEXT: LSHR T48.X, PV.W, literal.x,
+; CM-NEXT: LSHR T22.Y, T4.W, literal.y,
+; CM-NEXT: BFE_INT T49.Z, T4.Z, 0.0, literal.y,
+; CM-NEXT: LSHR * T12.W, T4.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: BFE_INT T57.X, T41.X, 0.0, literal.x,
-; CM-NEXT: LSHR T13.Y, T41.W, literal.x,
-; CM-NEXT: BFE_INT T58.Z, T41.W, 0.0, literal.x,
-; CM-NEXT: LSHR * T2.W, T41.Y, literal.x,
+; CM-NEXT: BFE_INT T49.X, T3.W, 0.0, literal.x,
+; CM-NEXT: LSHR T23.Y, T2.W, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T50.Z, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR * T2.W, T4.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T58.X, T41.Z, 0.0, literal.x,
-; CM-NEXT: LSHR T14.Y, T41.X, literal.x,
-; CM-NEXT: BFE_INT T41.Z, T42.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T57.W, PV.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T50.X, T4.Y, 0.0, literal.x,
+; CM-NEXT: LSHR T4.Y, T3.W, literal.x,
+; CM-NEXT: BFE_INT T51.Z, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T49.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T41.X, T42.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T57.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT T59.Z, T42.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T58.W, T13.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T51.X, T5.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T49.Y, PV.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T52.Z, T5.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T50.W, T23.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 140:
-; CM-NEXT: BFE_INT T59.X, T42.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T58.Y, T1.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT T42.Z, T43.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T41.W, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T30.X,
+; CM-NEXT: BFE_INT T52.X, T5.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T50.Y, T12.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T53.Z, T6.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T51.W, T22.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T42.X, T43.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T41.Y, T12.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T60.Z, T43.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T59.W, T11.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T53.X, T6.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T51.Y, T22.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T54.Z, T6.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T52.W, T21.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T60.X, T43.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T59.Y, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: BFE_INT T43.Z, T44.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T42.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T54.X, T7.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T52.Y, T21.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T55.Z, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T53.W, T20.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T43.X, T44.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T42.Y, T10.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T61.Z, T44.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T60.W, T9.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T55.X, T7.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T53.Y, T20.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T56.Z, T8.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T54.W, T19.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T61.X, T44.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T60.Y, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: BFE_INT T44.Z, T45.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T43.W, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T56.X, T8.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T54.Y, T19.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T57.Z, T8.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T55.W, T18.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T44.X, T45.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T43.Y, T8.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T62.Z, T45.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T61.W, T7.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T57.X, T9.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T55.Y, T18.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T58.Z, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T56.W, T17.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T62.X, T45.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: BFE_INT T45.Z, T37.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T58.X, T9.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T56.Y, T17.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T59.Z, T10.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T57.W, T16.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T45.X, T37.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T44.Y, T6.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T63.Z, T37.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T62.W, T5.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T59.X, T10.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T57.Y, T16.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T63.X, T37.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: BFE_INT T37.Z, T36.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: ALU clause starting at 254:
+; CM-NEXT: BFE_INT T60.Z, T10.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T58.W, T15.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T37.X, T36.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T45.Y, T4.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T64.Z, T36.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T60.X, T11.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T58.Y, T15.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T61.Z, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T59.W, T14.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T64.X, T36.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: BFE_INT T36.Z, T35.Y, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T61.X, T11.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T59.Y, T14.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T62.Z, T0.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T60.W, T13.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T36.X, T35.X, 0.0, literal.x,
-; CM-NEXT: BFE_INT T37.Y, T2.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T65.Z, T35.W, 0.0, literal.x,
-; CM-NEXT: BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T62.X, T0.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T60.Y, T13.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT T63.Z, T0.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T61.W, T12.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T65.X, T35.Z, 0.0, literal.x,
-; CM-NEXT: BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: LSHR T1.Z, T35.X, literal.x,
-; CM-NEXT: BFE_INT * T36.W, T1.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T63.X, T2.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T61.Y, T12.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T64.Z, T1.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T62.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: BFE_INT T64.X, T1.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T62.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR T0.Z, T2.W, literal.x,
+; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T35.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT T36.Y, PV.Z, 0.0, literal.y,
-; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
-; CM-NEXT: BFE_INT * T65.W, T0.Z, 0.0, literal.y,
+; CM-NEXT: LSHR T65.X, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_INT T63.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
+; CM-NEXT: BFE_INT * T64.W, T2.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T66.X, PV.Z, literal.x,
-; CM-NEXT: BFE_INT * T65.Y, T0.Y, 0.0, literal.y,
+; CM-NEXT: BFE_INT * T64.Y, T2.Y, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <64 x i16>, ptr addrspace(1) %in
%ext = sext <64 x i16> %load to <64 x i32>
@@ -5566,7 +6250,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -5575,9 +6259,11 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Z, T4.X, literal.x,
+; EG-NEXT: MOV * T2.X, T4.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR * T4.Z, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: MOV T4.W, 0.0,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
@@ -5587,7 +6273,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -5596,9 +6282,11 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T4.Z, T4.X, literal.x,
+; CM-NEXT: MOV * T2.X, T4.X,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: LSHR * T4.Z, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T4.X, T4.X, literal.x,
+; CM-NEXT: AND_INT T4.X, T0.Y, literal.x,
; CM-NEXT: MOV T4.Y, 0.0,
; CM-NEXT: MOV * T4.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -5807,25 +6495,29 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
+; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T6.X, T5.Y, literal.x,
-; EG-NEXT: MOV T6.Y, 0.0,
-; EG-NEXT: LSHR T5.Z, T5.X, literal.y,
-; EG-NEXT: AND_INT * T5.X, T5.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: AND_INT T5.X, T0.Z, literal.x,
; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T6.W, 0.0,
-; EG-NEXT: MOV * T5.W, 0.0,
+; EG-NEXT: LSHR T6.Z, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T6.Y, 0.0,
+; EG-NEXT: MOV T5.W, 0.0,
+; EG-NEXT: MOV * T6.W, 0.0,
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -5836,26 +6528,30 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T8.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
+; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T6.Z, T5.X, literal.x,
+; CM-NEXT: MOV * T2.X, T5.X,
+; CM-NEXT: MOV * T3.X, T5.Y,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: MOV * T0.Z, T2.X,
+; CM-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T6.X, T5.X, literal.x,
-; CM-NEXT: MOV T6.Y, 0.0,
-; CM-NEXT: LSHR * T5.Z, T5.Y, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T5.X, T5.Y, literal.x,
+; CM-NEXT: AND_INT T5.X, T0.Z, literal.x,
; CM-NEXT: MOV T5.Y, 0.0,
-; CM-NEXT: MOV * T6.W, 0.0,
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: LSHR * T6.Z, T0.Y, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T6.X, T0.Y, literal.x,
+; CM-NEXT: MOV T6.Y, 0.0,
; CM-NEXT: MOV * T5.W, 0.0,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: MOV * T6.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T7.X, PV.W, literal.x,
@@ -5956,62 +6652,69 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: ASHR * T5.W, T5.X, literal.x,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT: ASHR T5.Z, T5.X, literal.y,
-; EG-NEXT: ASHR * T7.W, T5.Y, literal.z,
+; EG-NEXT: MOV T2.X, T5.X,
+; EG-NEXT: MOV * T3.X, T5.Y,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ASHR * T6.W, T5.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: LSHR T7.X, PV.W, literal.x,
+; EG-NEXT: ASHR T6.Z, T5.Y, literal.y,
+; EG-NEXT: ASHR * T5.W, T5.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
-; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x,
+; EG-NEXT: BFE_INT T6.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR * T5.Z, T5.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
+; EG-NEXT: ASHR T6.Y, PV.X, literal.y,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: LSHR T8.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T7.Y, PV.X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR * T5.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v4i16_to_v4i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
+; CM-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
-; CM-NEXT: ASHR * T6.W, T5.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: LSHR T7.X, PV.Z, literal.x,
-; CM-NEXT: ASHR T6.Z, T5.Y, literal.y,
-; CM-NEXT: ASHR * T5.W, T5.X, literal.z,
+; CM-NEXT: MOV * T2.X, T5.X,
+; CM-NEXT: MOV * T3.X, T5.Y,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: ASHR * T5.W, T5.X, literal.x,
+; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR T5.Z, T5.X, literal.y,
+; CM-NEXT: ASHR * T7.W, T5.Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T6.X, T5.Y, 0.0, literal.x,
-; CM-NEXT: ASHR * T5.Z, T5.X, literal.x,
-; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
-; CM-NEXT: ASHR * T6.Y, PV.X, literal.y,
+; CM-NEXT: ASHR * T7.Z, T5.Y, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
+; CM-NEXT: ASHR T5.Y, PV.X, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; CM-NEXT: ASHR * T5.Y, PV.X, literal.y,
+; CM-NEXT: LSHR T8.X, PV.W, literal.x,
+; CM-NEXT: ASHR * T7.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <4 x i16>, ptr addrspace(1) %in
%ext = sext <4 x i16> %load to <4 x i64>
@@ -6140,37 +6843,45 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 30, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 1
+; EG-NEXT: ALU 38, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T11.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR * T8.Z, T7.W, literal.x,
+; EG-NEXT: MOV T3.X, T7.Y,
+; EG-NEXT: MOV * T2.X, T7.X,
+; EG-NEXT: MOV T5.X, T7.W,
+; EG-NEXT: MOV * T4.X, T7.Z,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PS,
+; EG-NEXT: MOV * T1.Y, PV.X,
+; EG-NEXT: LSHR * T7.Z, PS, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.W, literal.x,
-; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: LSHR T9.Z, T7.Z, literal.y,
-; EG-NEXT: AND_INT * T9.X, T7.Z, literal.x,
+; EG-NEXT: AND_INT T7.X, T1.Y, literal.x,
+; EG-NEXT: MOV T7.Y, 0.0,
+; EG-NEXT: LSHR T8.Z, T0.W, literal.y,
+; EG-NEXT: AND_INT * T8.X, T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T9.Y, 0.0,
-; EG-NEXT: LSHR * T10.Z, T7.Y, literal.x,
+; EG-NEXT: MOV T8.Y, 0.0,
+; EG-NEXT: LSHR * T9.Z, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T10.X, T7.Y, literal.x,
-; EG-NEXT: MOV T10.Y, 0.0,
-; EG-NEXT: LSHR T7.Z, T7.X, literal.y,
-; EG-NEXT: AND_INT * T7.X, T7.X, literal.x,
+; EG-NEXT: AND_INT T9.X, T0.Z, literal.x,
+; EG-NEXT: MOV T9.Y, 0.0,
+; EG-NEXT: LSHR T10.Z, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T10.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T7.Y, 0.0,
-; EG-NEXT: MOV T8.W, 0.0,
-; EG-NEXT: MOV * T9.W, 0.0,
-; EG-NEXT: MOV T10.W, 0.0,
-; EG-NEXT: MOV * T7.W, 0.0,
+; EG-NEXT: MOV T10.Y, 0.0,
+; EG-NEXT: MOV T7.W, 0.0,
+; EG-NEXT: MOV * T8.W, 0.0,
+; EG-NEXT: MOV T9.W, 0.0,
+; EG-NEXT: MOV * T10.W, 0.0,
; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -6187,38 +6898,46 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @8
-; CM-NEXT: ALU 32, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T14.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T13.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T12.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T11.X
+; CM-NEXT: ALU 40, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T13.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T12.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T11.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 11:
-; CM-NEXT: LSHR * T8.Z, T7.X, literal.x,
+; CM-NEXT: MOV * T3.X, T7.Y,
+; CM-NEXT: MOV * T2.X, T7.X,
+; CM-NEXT: MOV * T5.X, T7.W,
+; CM-NEXT: MOV T4.X, T7.Z,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: MOV T0.Z, PV.X,
+; CM-NEXT: MOV * T0.W, T3.X,
+; CM-NEXT: MOV * T1.Y, T2.X,
+; CM-NEXT: LSHR * T7.Z, PV.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T8.X, T7.X, literal.x,
+; CM-NEXT: AND_INT T7.X, T1.Y, literal.x,
+; CM-NEXT: MOV T7.Y, 0.0,
+; CM-NEXT: LSHR * T8.Z, T0.W, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T8.X, T0.W, literal.x,
; CM-NEXT: MOV T8.Y, 0.0,
-; CM-NEXT: LSHR * T9.Z, T7.Y, literal.y,
+; CM-NEXT: LSHR * T9.Z, T0.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T9.X, T7.Y, literal.x,
+; CM-NEXT: AND_INT T9.X, T0.Z, literal.x,
; CM-NEXT: MOV T9.Y, 0.0,
-; CM-NEXT: LSHR * T10.Z, T7.Z, literal.y,
+; CM-NEXT: LSHR * T10.Z, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T10.X, T7.Z, literal.x,
+; CM-NEXT: AND_INT T10.X, T0.Y, literal.x,
; CM-NEXT: MOV T10.Y, 0.0,
-; CM-NEXT: LSHR * T7.Z, T7.W, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T7.X, T7.W, literal.x,
-; CM-NEXT: MOV T7.Y, 0.0,
-; CM-NEXT: MOV * T8.W, 0.0,
+; CM-NEXT: MOV * T7.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: MOV * T8.W, 0.0,
; CM-NEXT: MOV * T9.W, 0.0,
; CM-NEXT: MOV * T10.W, 0.0,
-; CM-NEXT: MOV * T7.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T11.X, PV.W, literal.x,
@@ -6365,101 +7084,107 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 36, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T9.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T9.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T5.X, T7.W,
+; EG-NEXT: MOV * T3.X, T7.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV T0.Z, PS,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T8.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T9.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: ASHR * T10.W, T7.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: ASHR T10.Z, T7.X, literal.y,
-; EG-NEXT: ASHR * T12.W, T7.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T10.X, T7.X, 0.0, literal.x,
-; EG-NEXT: ASHR T12.Z, T7.Y, literal.x,
-; EG-NEXT: ASHR * T13.W, T7.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T12.X, T7.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T10.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T13.Z, T7.Z, literal.x,
-; EG-NEXT: ASHR * T14.W, T7.W, literal.y,
+; EG-NEXT: LSHR * T10.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T11.X, T7.Y, 0.0, literal.x,
+; EG-NEXT: ASHR * T12.W, T7.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T13.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T14.Z, T7.W, literal.x,
+; EG-NEXT: BFE_INT T13.X, T7.W, 0.0, literal.x,
+; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T12.Z, T7.X, literal.x,
+; EG-NEXT: ASHR * T14.W, T7.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T14.X, T7.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T12.X, T7.X, 0.0, literal.x,
; EG-NEXT: ASHR T13.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR T14.Z, T7.Z, literal.x,
+; EG-NEXT: ASHR * T11.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T14.X, T7.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T11.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T13.W, T0.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T7.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T14.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T14.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T13.Z, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v8i16_to_v8i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @8
-; CM-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T11.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T9.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T8.X
+; CM-NEXT: ALU 36, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T14.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T10.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T9.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T8.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 11:
+; CM-NEXT: MOV * T5.X, T7.W,
+; CM-NEXT: MOV * T3.X, T7.Y,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: MOV T0.Z, T5.X,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T8.X, PV.W, literal.x,
+; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR * T8.X, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT: LSHR T9.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT: ASHR * T10.W, T7.W, literal.z,
-; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T11.X, PV.Z, literal.x,
-; CM-NEXT: ASHR T10.Z, T7.W, literal.y,
-; CM-NEXT: ASHR * T12.W, T7.Z, literal.z,
-; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T10.X, T7.W, 0.0, literal.x,
-; CM-NEXT: ASHR T12.Z, T7.Z, literal.x,
-; CM-NEXT: ASHR * T13.W, T7.Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; CM-NEXT: LSHR * T10.X, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: BFE_INT T11.X, T7.W, 0.0, literal.x,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR * T12.W, T7.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T12.X, T7.Z, 0.0, literal.x,
-; CM-NEXT: ASHR T10.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T13.Z, T7.Y, literal.x,
+; CM-NEXT: BFE_INT T13.X, T7.Y, 0.0, literal.x,
+; CM-NEXT: ASHR T11.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T12.Z, T7.Z, literal.x,
; CM-NEXT: ASHR * T7.W, T7.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T13.X, T7.Y, 0.0, literal.x,
-; CM-NEXT: ASHR T12.Y, PV.X, literal.y,
-; CM-NEXT: ASHR * T7.Z, T7.X, literal.x,
+; CM-NEXT: BFE_INT T12.X, T7.Z, 0.0, literal.x,
+; CM-NEXT: ASHR T13.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T7.Z, T7.X, literal.x,
+; CM-NEXT: ASHR * T11.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T7.X, T7.X, 0.0, literal.x,
-; CM-NEXT: ASHR * T13.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T12.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T11.Z, T0.Z, literal.x,
+; CM-NEXT: ASHR * T13.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
-; CM-NEXT: ASHR * T7.Y, PV.X, literal.y,
+; CM-NEXT: LSHR T14.X, T1.Z, literal.x,
+; CM-NEXT: ASHR T7.Y, PV.X, literal.y,
+; CM-NEXT: ASHR * T13.Z, T0.Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
%load = load <8 x i16>, ptr addrspace(1) %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -6671,64 +7396,84 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
;
; EG-LABEL: global_zextload_v16i16_to_v16i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 62, @17, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 1
+; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 3, @19, KC0[], KC1[]
+; EG-NEXT: TEX 0 @16
+; EG-NEXT: ALU 74, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T19.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; EG-NEXT: Fetch clause starting at 16:
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 17:
-; EG-NEXT: LSHR * T13.Z, T12.W, literal.x,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T7.X, T12.Y,
+; EG-NEXT: MOV * T6.X, T12.X,
+; EG-NEXT: MOV T9.X, T12.W,
+; EG-NEXT: MOV * T8.X, T12.Z,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV T3.X, T11.Y,
+; EG-NEXT: MOV * T2.X, T11.X,
+; EG-NEXT: MOV T5.X, T11.W,
+; EG-NEXT: MOV * T4.X, T11.Z,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: MOV T0.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T8.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T9.X,
+; EG-NEXT: MOV T1.Z, T2.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T3.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T4.X,
+; EG-NEXT: MOV * T2.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T11.Z, PV.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T11.X, T2.Z, literal.x,
+; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: LSHR T12.Z, T2.Y, literal.y,
+; EG-NEXT: AND_INT * T12.X, T2.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T12.Y, 0.0,
+; EG-NEXT: LSHR * T13.Z, T1.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T12.W, literal.x,
+; EG-NEXT: AND_INT T13.X, T1.W, literal.x,
; EG-NEXT: MOV T13.Y, 0.0,
-; EG-NEXT: LSHR T14.Z, T12.Z, literal.y,
-; EG-NEXT: AND_INT * T14.X, T12.Z, literal.x,
+; EG-NEXT: LSHR T14.Z, T1.Z, literal.y,
+; EG-NEXT: AND_INT * T14.X, T1.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T14.Y, 0.0,
-; EG-NEXT: LSHR * T15.Z, T12.Y, literal.x,
+; EG-NEXT: LSHR * T15.Z, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T15.X, T12.Y, literal.x,
+; EG-NEXT: AND_INT T15.X, T1.Y, literal.x,
; EG-NEXT: MOV T15.Y, 0.0,
-; EG-NEXT: LSHR T12.Z, T12.X, literal.y,
-; EG-NEXT: AND_INT * T12.X, T12.X, literal.x,
+; EG-NEXT: LSHR T16.Z, T0.W, literal.y,
+; EG-NEXT: AND_INT * T16.X, T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T12.Y, 0.0,
-; EG-NEXT: LSHR * T16.Z, T11.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T16.X, T11.W, literal.x,
; EG-NEXT: MOV T16.Y, 0.0,
-; EG-NEXT: LSHR T17.Z, T11.Z, literal.y,
-; EG-NEXT: AND_INT * T17.X, T11.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T17.Y, 0.0,
-; EG-NEXT: LSHR * T18.Z, T11.Y, literal.x,
+; EG-NEXT: LSHR * T17.Z, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T18.X, T11.Y, literal.x,
-; EG-NEXT: MOV T18.Y, 0.0,
-; EG-NEXT: LSHR T11.Z, T11.X, literal.y,
-; EG-NEXT: AND_INT * T11.X, T11.X, literal.x,
+; EG-NEXT: AND_INT T17.X, T0.Z, literal.x,
+; EG-NEXT: MOV T17.Y, 0.0,
+; EG-NEXT: LSHR T18.Z, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T18.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: MOV T18.Y, 0.0,
+; EG-NEXT: MOV T11.W, 0.0,
+; EG-NEXT: MOV * T12.W, 0.0,
; EG-NEXT: MOV T13.W, 0.0,
; EG-NEXT: MOV * T14.W, 0.0,
; EG-NEXT: MOV T15.W, 0.0,
-; EG-NEXT: MOV * T12.W, 0.0,
-; EG-NEXT: MOV T16.W, 0.0,
-; EG-NEXT: MOV * T17.W, 0.0,
-; EG-NEXT: MOV T18.W, 0.0,
-; EG-NEXT: MOV * T11.W, 0.0,
+; EG-NEXT: MOV * T16.W, 0.0,
+; EG-NEXT: MOV T17.W, 0.0,
+; EG-NEXT: MOV * T18.W, 0.0,
; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
@@ -6755,65 +7500,85 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
;
; CM-LABEL: global_zextload_v16i16_to_v16i64:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 1 @12
-; CM-NEXT: ALU 64, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T26.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T25.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T24.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T23.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T22.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T21.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T18, T20.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T19.X
+; CM-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @14
+; CM-NEXT: ALU 3, @19, KC0[], KC1[]
+; CM-NEXT: TEX 0 @16
+; CM-NEXT: ALU 76, @23, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T26.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T25.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T24.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T23.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T22.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T21.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T20.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T18, T19.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 12:
+; CM-NEXT: Fetch clause starting at 14:
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
-; CM-NEXT: ALU clause starting at 16:
+; CM-NEXT: ALU clause starting at 18:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 17:
-; CM-NEXT: LSHR * T13.Z, T12.X, literal.x,
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: MOV * T7.X, T12.Y,
+; CM-NEXT: MOV * T6.X, T12.X,
+; CM-NEXT: MOV * T9.X, T12.W,
+; CM-NEXT: MOV * T8.X, T12.Z,
+; CM-NEXT: ALU clause starting at 23:
+; CM-NEXT: MOV * T3.X, T11.Y,
+; CM-NEXT: MOV * T2.X, T11.X,
+; CM-NEXT: MOV * T5.X, T11.W,
+; CM-NEXT: MOV T4.X, T11.Z,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: MOV T0.Z, PV.X,
+; CM-NEXT: MOV * T0.W, T3.X,
+; CM-NEXT: MOV T1.Y, T2.X,
+; CM-NEXT: MOV T1.Z, T9.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T8.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T7.X,
+; CM-NEXT: MOV * T2.Z, T6.X, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR * T11.Z, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T13.X, T12.X, literal.x,
+; CM-NEXT: AND_INT T11.X, T2.Z, literal.x,
+; CM-NEXT: MOV T11.Y, 0.0,
+; CM-NEXT: LSHR * T12.Z, T2.Y, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T12.X, T2.Y, literal.x,
+; CM-NEXT: MOV T12.Y, 0.0,
+; CM-NEXT: LSHR * T13.Z, T1.W, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T13.X, T1.W, literal.x,
; CM-NEXT: MOV T13.Y, 0.0,
-; CM-NEXT: LSHR * T14.Z, T12.Y, literal.y,
+; CM-NEXT: LSHR * T14.Z, T1.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T14.X, T12.Y, literal.x,
+; CM-NEXT: AND_INT T14.X, T1.Z, literal.x,
; CM-NEXT: MOV T14.Y, 0.0,
-; CM-NEXT: LSHR * T15.Z, T12.Z, literal.y,
+; CM-NEXT: LSHR * T15.Z, T1.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T15.X, T12.Z, literal.x,
+; CM-NEXT: AND_INT T15.X, T1.Y, literal.x,
; CM-NEXT: MOV T15.Y, 0.0,
-; CM-NEXT: LSHR * T12.Z, T12.W, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T12.X, T12.W, literal.x,
-; CM-NEXT: MOV T12.Y, 0.0,
-; CM-NEXT: LSHR * T16.Z, T11.X, literal.y,
+; CM-NEXT: LSHR * T16.Z, T0.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T16.X, T11.X, literal.x,
+; CM-NEXT: AND_INT T16.X, T0.W, literal.x,
; CM-NEXT: MOV T16.Y, 0.0,
-; CM-NEXT: LSHR * T17.Z, T11.Y, literal.y,
+; CM-NEXT: LSHR * T17.Z, T0.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T17.X, T11.Y, literal.x,
+; CM-NEXT: AND_INT T17.X, T0.Z, literal.x,
; CM-NEXT: MOV T17.Y, 0.0,
-; CM-NEXT: LSHR * T18.Z, T11.Z, literal.y,
+; CM-NEXT: LSHR * T18.Z, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T18.X, T11.Z, literal.x,
+; CM-NEXT: AND_INT T18.X, T0.Y, literal.x,
; CM-NEXT: MOV T18.Y, 0.0,
-; CM-NEXT: LSHR * T11.Z, T11.W, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T11.X, T11.W, literal.x,
-; CM-NEXT: MOV T11.Y, 0.0,
-; CM-NEXT: MOV * T13.W, 0.0,
+; CM-NEXT: MOV * T11.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: MOV * T12.W, 0.0,
+; CM-NEXT: MOV * T13.W, 0.0,
; CM-NEXT: MOV * T14.W, 0.0,
; CM-NEXT: MOV * T15.W, 0.0,
-; CM-NEXT: MOV * T12.W, 0.0,
; CM-NEXT: MOV * T16.W, 0.0,
; CM-NEXT: MOV * T17.W, 0.0,
; CM-NEXT: MOV * T18.W, 0.0,
-; CM-NEXT: MOV * T11.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T19.X, PV.W, literal.x,
@@ -7063,177 +7828,200 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
;
; EG-LABEL: global_sextload_v16i16_to_v16i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 65, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 1, @19, KC0[], KC1[]
+; EG-NEXT: TEX 0 @16
+; EG-NEXT: ALU 71, @21, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T15.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T13.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: Fetch clause starting at 14:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 17:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: MOV T5.X, T12.W,
+; EG-NEXT: MOV * T3.X, T12.Y,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: MOV T9.X, T11.W,
+; EG-NEXT: MOV * T7.X, T11.Y,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: MOV * T1.Y, PS,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T15.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT: LSHR * T17.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: ASHR * T19.W, T11.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: ASHR T19.Z, T11.X, literal.y,
-; EG-NEXT: ASHR * T21.W, T11.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T19.X, T11.X, 0.0, literal.x,
-; EG-NEXT: ASHR T21.Z, T11.Y, literal.x,
-; EG-NEXT: ASHR * T22.W, T11.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T21.X, T11.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T22.Z, T11.Z, literal.x,
-; EG-NEXT: ASHR * T23.W, T11.W, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: LSHR * T19.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T20.X, T11.Y, 0.0, literal.x,
+; EG-NEXT: ASHR * T21.W, T11.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T22.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T23.Z, T11.W, literal.x,
-; EG-NEXT: ASHR * T24.W, T12.X, literal.y,
+; EG-NEXT: BFE_INT T22.X, T11.W, 0.0, literal.x,
+; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T21.Z, T11.X, literal.x,
+; EG-NEXT: ASHR * T23.W, T11.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T23.X, T11.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T21.X, T11.X, 0.0, literal.x,
; EG-NEXT: ASHR T22.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T24.Z, T12.X, literal.x,
-; EG-NEXT: ASHR * T11.W, T12.Y, literal.y,
+; EG-NEXT: ASHR T23.Z, T11.Z, literal.x,
+; EG-NEXT: ASHR * T20.W, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T24.X, T12.X, 0.0, literal.x,
-; EG-NEXT: ASHR T23.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T11.Z, T12.Y, literal.x,
-; EG-NEXT: ASHR * T25.W, T12.Z, literal.y,
+; EG-NEXT: BFE_INT T23.X, T11.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T20.Z, T1.Y, literal.x,
+; EG-NEXT: ASHR * T22.W, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T24.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T25.Z, T12.Z, literal.x,
-; EG-NEXT: ASHR * T26.W, T12.W, literal.y,
+; EG-NEXT: ASHR T23.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T22.Z, T0.W, literal.x,
+; EG-NEXT: ASHR * T24.W, T12.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T25.X, T12.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.X, T12.W, 0.0, literal.x,
; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T26.Z, T12.W, literal.x,
+; EG-NEXT: ASHR T24.Z, T12.X, literal.x,
+; EG-NEXT: ASHR * T26.W, T12.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T26.X, T12.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T24.X, T12.X, 0.0, literal.x,
; EG-NEXT: ASHR T25.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR T26.Z, T12.Z, literal.x,
+; EG-NEXT: ASHR * T11.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T26.X, T12.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T24.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T11.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T25.W, T0.Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T26.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T26.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T25.Z, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v16i16_to_v16i64:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 1 @12
-; CM-NEXT: ALU 65, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T26.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T20.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T18.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T17.X
+; CM-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @14
+; CM-NEXT: ALU 1, @19, KC0[], KC1[]
+; CM-NEXT: TEX 0 @16
+; CM-NEXT: ALU 70, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T26.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T19.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T18.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T17.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T16.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T15.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T14.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T13.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T15.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T14.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T13.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 12:
-; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
-; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
-; CM-NEXT: ALU clause starting at 16:
+; CM-NEXT: Fetch clause starting at 14:
+; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; CM-NEXT: Fetch clause starting at 16:
+; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; CM-NEXT: ALU clause starting at 18:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 17:
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: MOV * T5.X, T12.W,
+; CM-NEXT: MOV * T3.X, T12.Y,
+; CM-NEXT: ALU clause starting at 21:
+; CM-NEXT: MOV * T9.X, T11.W,
+; CM-NEXT: MOV * T7.X, T11.Y,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: MOV T0.Z, T9.X,
+; CM-NEXT: MOV * T0.W, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV T1.Y, T5.X,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T13.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT: LSHR T14.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT: LSHR T15.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT: LSHR T16.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT: LSHR T17.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: LSHR T14.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; CM-NEXT: LSHR * T15.X, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T16.X, KC0[2].Y, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
+; CM-NEXT: LSHR T17.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; CM-NEXT: LSHR T18.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT: ASHR * T19.W, T11.W, literal.z,
-; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T20.X, PV.Z, literal.x,
-; CM-NEXT: ASHR T19.Z, T11.W, literal.y,
-; CM-NEXT: ASHR * T21.W, T11.Z, literal.z,
-; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T19.X, T11.W, 0.0, literal.x,
-; CM-NEXT: ASHR T21.Z, T11.Z, literal.x,
-; CM-NEXT: ASHR * T22.W, T11.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T21.X, T11.Z, 0.0, literal.x,
-; CM-NEXT: ASHR T19.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T22.Z, T11.Y, literal.x,
-; CM-NEXT: ASHR * T11.W, T11.X, literal.y,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; CM-NEXT: LSHR * T19.X, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: BFE_INT T20.X, T12.W, 0.0, literal.x,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR * T21.W, T12.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T22.X, T11.Y, 0.0, literal.x,
-; CM-NEXT: ASHR T21.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T11.Z, T11.X, literal.x,
-; CM-NEXT: ASHR * T23.W, T12.W, literal.y,
+; CM-NEXT: BFE_INT T22.X, T12.Y, 0.0, literal.x,
+; CM-NEXT: ASHR T20.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T21.Z, T12.Z, literal.x,
+; CM-NEXT: ASHR * T12.W, T12.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T11.X, T11.X, 0.0, literal.x,
+; CM-NEXT: BFE_INT T21.X, T12.Z, 0.0, literal.x,
; CM-NEXT: ASHR T22.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T23.Z, T12.W, literal.x,
-; CM-NEXT: ASHR * T24.W, T12.Z, literal.y,
+; CM-NEXT: ASHR T12.Z, T12.X, literal.x,
+; CM-NEXT: ASHR * T20.W, T1.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T23.X, T12.W, 0.0, literal.x,
-; CM-NEXT: ASHR T11.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T24.Z, T12.Z, literal.x,
-; CM-NEXT: ASHR * T25.W, T12.Y, literal.y,
+; CM-NEXT: BFE_INT T12.X, T12.X, 0.0, literal.x,
+; CM-NEXT: ASHR T21.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T20.Z, T1.Y, literal.x,
+; CM-NEXT: ASHR * T22.W, T0.W, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T24.X, T12.Z, 0.0, literal.x,
+; CM-NEXT: BFE_INT T23.X, T11.W, 0.0, literal.x,
+; CM-NEXT: ASHR T12.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T22.Z, T0.W, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: ASHR * T24.W, T11.Z, literal.y,
+; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; CM-NEXT: BFE_INT T25.X, T11.Y, 0.0, literal.x,
; CM-NEXT: ASHR T23.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T25.Z, T12.Y, literal.x,
-; CM-NEXT: ASHR * T12.W, T12.X, literal.y,
+; CM-NEXT: ASHR T24.Z, T11.Z, literal.x,
+; CM-NEXT: ASHR * T11.W, T11.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T25.X, T12.Y, 0.0, literal.x,
-; CM-NEXT: ASHR T24.Y, PV.X, literal.y,
-; CM-NEXT: ASHR * T12.Z, T12.X, literal.x,
+; CM-NEXT: BFE_INT T24.X, T11.Z, 0.0, literal.x,
+; CM-NEXT: ASHR T25.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T11.Z, T11.X, literal.x,
+; CM-NEXT: ASHR * T23.W, T0.Z, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T12.X, T12.X, 0.0, literal.x,
-; CM-NEXT: ASHR * T25.Y, PV.X, literal.y,
+; CM-NEXT: BFE_INT T11.X, T11.X, 0.0, literal.x,
+; CM-NEXT: ASHR T24.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T23.Z, T0.Z, literal.x,
+; CM-NEXT: ASHR * T25.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: LSHR T26.X, KC0[2].Y, literal.x,
-; CM-NEXT: ASHR * T12.Y, PV.X, literal.y,
+; CM-NEXT: LSHR T26.X, T1.Z, literal.x,
+; CM-NEXT: ASHR T11.Y, PV.X, literal.y,
+; CM-NEXT: ASHR * T25.Z, T0.Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
%load = load <16 x i16>, ptr addrspace(1) %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -7632,118 +8420,161 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
;
; EG-LABEL: global_zextload_v32i16_to_v32i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @22
-; EG-NEXT: ALU 33, @31, KC0[], KC1[]
+; EG-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 93, @65, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T48.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T47.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T46.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T45.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T44.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T43.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T35.X, 1
+; EG-NEXT: ALU 3, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @30
+; EG-NEXT: ALU 3, @41, KC0[], KC1[]
+; EG-NEXT: TEX 0 @32
+; EG-NEXT: ALU 3, @45, KC0[], KC1[]
+; EG-NEXT: TEX 0 @34
+; EG-NEXT: ALU 99, @49, KC0[], KC1[]
+; EG-NEXT: ALU 47, @149, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T50.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T49.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T48.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T47.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T46.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T45.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T44.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T43.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T42.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T41.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T40.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T39.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T38.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T36.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T35.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 16, #1
-; EG-NEXT: VTX_READ_128 T22.XYZW, T19.X, 32, #1
+; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_128 T29.XYZW, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; EG-NEXT: Fetch clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: Fetch clause starting at 32:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; EG-NEXT: Fetch clause starting at 34:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; EG-NEXT: ALU clause starting at 36:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 31:
-; EG-NEXT: LSHR * T23.Z, T20.Z, literal.x,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T14.X, T20.X,
+; EG-NEXT: MOV * T15.X, T20.Y,
+; EG-NEXT: MOV T16.X, T20.Z,
+; EG-NEXT: MOV * T17.X, T20.W,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: MOV T10.X, T20.X,
+; EG-NEXT: MOV * T11.X, T20.Y,
+; EG-NEXT: MOV T12.X, T20.Z,
+; EG-NEXT: MOV * T13.X, T20.W,
+; EG-NEXT: ALU clause starting at 45:
+; EG-NEXT: MOV T6.X, T20.X,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: MOV T8.X, T20.Z,
+; EG-NEXT: MOV * T9.X, T20.W,
+; EG-NEXT: ALU clause starting at 49:
+; EG-NEXT: MOV T2.X, T19.X,
+; EG-NEXT: MOV * T3.X, T19.Y,
+; EG-NEXT: MOV T4.X, T19.Z,
+; EG-NEXT: MOV * T5.X, T19.W,
+; EG-NEXT: MOV T0.Y, T12.X,
+; EG-NEXT: MOV T0.Z, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T6.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T9.X,
+; EG-NEXT: MOV T1.Z, T8.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T3.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T2.X,
+; EG-NEXT: MOV T2.Z, T5.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T2.W, T4.X, BS:VEC_201
+; EG-NEXT: LSHR * T19.Z, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T19.X, T2.W, literal.x,
+; EG-NEXT: MOV T19.Y, 0.0,
+; EG-NEXT: LSHR T20.Z, T2.Z, literal.y,
+; EG-NEXT: AND_INT * T20.X, T2.Z, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T20.Y, 0.0,
+; EG-NEXT: LSHR * T21.Z, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T23.X, T20.Z, literal.x,
+; EG-NEXT: AND_INT T21.X, T2.Y, literal.x,
+; EG-NEXT: MOV T21.Y, 0.0,
+; EG-NEXT: LSHR T22.Z, T1.W, literal.y,
+; EG-NEXT: AND_INT * T22.X, T1.W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: MOV T22.Y, 0.0,
+; EG-NEXT: LSHR * T23.Z, T1.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T23.X, T1.Z, literal.x,
; EG-NEXT: MOV T23.Y, 0.0,
-; EG-NEXT: LSHR T24.Z, T20.W, literal.y,
-; EG-NEXT: AND_INT * T24.X, T20.W, literal.x,
+; EG-NEXT: LSHR T24.Z, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T24.X, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T24.Y, 0.0,
-; EG-NEXT: LSHR * T25.Z, T20.X, literal.x,
+; EG-NEXT: LSHR * T25.Z, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T25.X, T20.X, literal.x,
+; EG-NEXT: AND_INT T25.X, T0.W, literal.x,
; EG-NEXT: MOV T25.Y, 0.0,
-; EG-NEXT: LSHR T20.Z, T20.Y, literal.y,
-; EG-NEXT: AND_INT * T20.X, T20.Y, literal.x,
+; EG-NEXT: LSHR T26.Z, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T26.X, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T20.Y, 0.0,
-; EG-NEXT: LSHR * T26.Z, T22.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T26.X, T22.Z, literal.x,
; EG-NEXT: MOV T26.Y, 0.0,
-; EG-NEXT: LSHR T27.Z, T22.W, literal.y,
-; EG-NEXT: AND_INT * T27.X, T22.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T27.Y, 0.0,
-; EG-NEXT: LSHR * T28.Z, T22.X, literal.x,
+; EG-NEXT: LSHR T27.Z, T0.Y, literal.x,
+; EG-NEXT: MOV * T0.Z, T15.X,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T28.X, T22.X, literal.x,
-; EG-NEXT: MOV T28.Y, 0.0,
-; EG-NEXT: LSHR T22.Z, T22.Y, literal.y,
-; EG-NEXT: AND_INT * T22.X, T22.Y, literal.x,
+; EG-NEXT: MOV * T0.W, T14.X,
+; EG-NEXT: MOV T1.Y, T17.X,
+; EG-NEXT: MOV T1.Z, T16.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T10.X,
+; EG-NEXT: MOV * T2.Z, T13.X, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T27.X, T0.Y, literal.x,
+; EG-NEXT: MOV T27.Y, 0.0,
+; EG-NEXT: LSHR T28.Z, PV.Z, literal.y,
+; EG-NEXT: AND_INT * T28.X, PV.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T22.Y, 0.0,
-; EG-NEXT: LSHR * T19.Z, T21.Z, literal.x,
+; EG-NEXT: MOV T28.Y, 0.0,
+; EG-NEXT: LSHR * T29.Z, T2.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 65:
-; EG-NEXT: AND_INT T19.X, T21.Z, literal.x,
-; EG-NEXT: MOV T19.Y, 0.0,
-; EG-NEXT: LSHR T30.Z, T21.W, literal.y,
-; EG-NEXT: AND_INT * T30.X, T21.W, literal.x,
+; EG-NEXT: AND_INT T29.X, T2.Y, literal.x,
+; EG-NEXT: MOV T29.Y, 0.0,
+; EG-NEXT: LSHR T30.Z, T1.W, literal.y,
+; EG-NEXT: AND_INT * T30.X, T1.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T30.Y, 0.0,
-; EG-NEXT: LSHR * T31.Z, T21.X, literal.x,
+; EG-NEXT: LSHR * T31.Z, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T31.X, T21.X, literal.x,
+; EG-NEXT: AND_INT T31.X, T1.Z, literal.x,
; EG-NEXT: MOV T31.Y, 0.0,
-; EG-NEXT: LSHR T21.Z, T21.Y, literal.y,
-; EG-NEXT: AND_INT * T21.X, T21.Y, literal.x,
+; EG-NEXT: LSHR T32.Z, T1.Y, literal.y,
+; EG-NEXT: AND_INT * T32.X, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T21.Y, 0.0,
-; EG-NEXT: LSHR * T32.Z, T29.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T32.X, T29.Z, literal.x,
; EG-NEXT: MOV T32.Y, 0.0,
-; EG-NEXT: LSHR T33.Z, T29.W, literal.y,
-; EG-NEXT: AND_INT * T33.X, T29.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T33.Y, 0.0,
-; EG-NEXT: LSHR * T34.Z, T29.X, literal.x,
+; EG-NEXT: LSHR * T33.Z, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T34.X, T29.X, literal.x,
-; EG-NEXT: MOV T34.Y, 0.0,
-; EG-NEXT: LSHR T29.Z, T29.Y, literal.y,
-; EG-NEXT: AND_INT * T29.X, T29.Y, literal.x,
+; EG-NEXT: AND_INT T33.X, T0.W, literal.x,
+; EG-NEXT: MOV T33.Y, 0.0,
+; EG-NEXT: LSHR T34.Z, T0.Z, literal.y,
+; EG-NEXT: AND_INT * T34.X, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: MOV T29.Y, 0.0,
+; EG-NEXT: MOV T34.Y, 0.0,
+; EG-NEXT: MOV T19.W, 0.0,
+; EG-NEXT: MOV * T20.W, 0.0,
+; EG-NEXT: MOV T21.W, 0.0,
+; EG-NEXT: MOV * T22.W, 0.0,
; EG-NEXT: MOV T23.W, 0.0,
; EG-NEXT: MOV * T24.W, 0.0,
; EG-NEXT: MOV T25.W, 0.0,
-; EG-NEXT: MOV * T20.W, 0.0,
-; EG-NEXT: MOV T26.W, 0.0,
-; EG-NEXT: MOV * T27.W, 0.0,
-; EG-NEXT: MOV T28.W, 0.0,
-; EG-NEXT: MOV * T22.W, 0.0,
-; EG-NEXT: MOV T19.W, 0.0,
+; EG-NEXT: MOV * T26.W, 0.0,
+; EG-NEXT: MOV T27.W, 0.0,
+; EG-NEXT: MOV * T28.W, 0.0,
+; EG-NEXT: MOV T29.W, 0.0,
; EG-NEXT: MOV * T30.W, 0.0,
; EG-NEXT: MOV T31.W, 0.0,
-; EG-NEXT: MOV * T21.W, 0.0,
-; EG-NEXT: MOV T32.W, 0.0,
-; EG-NEXT: MOV * T33.W, 0.0,
-; EG-NEXT: MOV T34.W, 0.0,
-; EG-NEXT: MOV * T29.W, 0.0,
+; EG-NEXT: MOV * T32.W, 0.0,
+; EG-NEXT: MOV T33.W, 0.0,
+; EG-NEXT: MOV * T34.W, 0.0,
+; EG-NEXT: ALU clause starting at 149:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T35.X, PV.W, literal.x,
@@ -7795,122 +8626,165 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
;
; CM-LABEL: global_zextload_v32i16_to_v32i64:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 2 @22
-; CM-NEXT: ALU 33, @31, KC0[], KC1[]
+; CM-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @28
-; CM-NEXT: ALU 94, @65, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 3, @37, KC0[], KC1[]
+; CM-NEXT: TEX 0 @30
+; CM-NEXT: ALU 3, @41, KC0[], KC1[]
+; CM-NEXT: TEX 0 @32
+; CM-NEXT: ALU 3, @45, KC0[], KC1[]
+; CM-NEXT: TEX 0 @34
+; CM-NEXT: ALU 100, @49, KC0[], KC1[]
+; CM-NEXT: ALU 47, @150, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T50.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T49.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T48.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T47.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T46.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T45.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T44.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T43.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T42.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T41.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T40.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T39.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T38.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T37.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T36.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T23.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T49.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T48.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T47.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T46.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T45.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T44.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T43.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T42.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T41.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T40.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T39.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T38.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T37.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T36.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T35.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 22:
-; CM-NEXT: VTX_READ_128 T21.XYZW, T20.X, 0, #1
-; CM-NEXT: VTX_READ_128 T22.XYZW, T20.X, 32, #1
-; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 16, #1
+; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 28:
-; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 48, #1
-; CM-NEXT: ALU clause starting at 30:
-; CM-NEXT: MOV * T20.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 31:
-; CM-NEXT: LSHR * T19.Z, T21.Y, literal.x,
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
+; CM-NEXT: Fetch clause starting at 30:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; CM-NEXT: Fetch clause starting at 32:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
+; CM-NEXT: Fetch clause starting at 34:
+; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 48, #1
+; CM-NEXT: ALU clause starting at 36:
+; CM-NEXT: MOV * T19.X, KC0[2].Z,
+; CM-NEXT: ALU clause starting at 37:
+; CM-NEXT: MOV * T14.X, T20.X,
+; CM-NEXT: MOV * T15.X, T20.Y,
+; CM-NEXT: MOV * T16.X, T20.Z,
+; CM-NEXT: MOV * T17.X, T20.W,
+; CM-NEXT: ALU clause starting at 41:
+; CM-NEXT: MOV * T10.X, T20.X,
+; CM-NEXT: MOV * T11.X, T20.Y,
+; CM-NEXT: MOV * T12.X, T20.Z,
+; CM-NEXT: MOV * T13.X, T20.W,
+; CM-NEXT: ALU clause starting at 45:
+; CM-NEXT: MOV * T6.X, T20.X,
+; CM-NEXT: MOV * T7.X, T20.Y,
+; CM-NEXT: MOV * T8.X, T20.Z,
+; CM-NEXT: MOV * T9.X, T20.W,
+; CM-NEXT: ALU clause starting at 49:
+; CM-NEXT: MOV * T2.X, T19.X,
+; CM-NEXT: MOV * T3.X, T19.Y,
+; CM-NEXT: MOV * T4.X, T19.Z,
+; CM-NEXT: MOV T5.X, T19.W,
+; CM-NEXT: MOV T0.Y, T7.X,
+; CM-NEXT: MOV T0.Z, T12.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T0.W, T13.X, BS:VEC_201
+; CM-NEXT: MOV T1.Y, T10.X,
+; CM-NEXT: MOV T1.Z, T11.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T16.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T17.X,
+; CM-NEXT: MOV T2.Z, T14.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T15.X, BS:VEC_201
+; CM-NEXT: LSHR * T19.Z, PV.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T19.X, T21.Y, literal.x,
+; CM-NEXT: AND_INT T19.X, T2.W, literal.x,
; CM-NEXT: MOV T19.Y, 0.0,
-; CM-NEXT: LSHR * T24.Z, T21.X, literal.y,
+; CM-NEXT: LSHR * T20.Z, T2.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T24.X, T21.X, literal.x,
+; CM-NEXT: AND_INT T20.X, T2.Z, literal.x,
+; CM-NEXT: MOV T20.Y, 0.0,
+; CM-NEXT: LSHR * T21.Z, T2.Y, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T21.X, T2.Y, literal.x,
+; CM-NEXT: MOV T21.Y, 0.0,
+; CM-NEXT: LSHR * T22.Z, T1.W, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T22.X, T1.W, literal.x,
+; CM-NEXT: MOV T22.Y, 0.0,
+; CM-NEXT: LSHR * T23.Z, T1.Z, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T23.X, T1.Z, literal.x,
+; CM-NEXT: MOV T23.Y, 0.0,
+; CM-NEXT: LSHR * T24.Z, T1.Y, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: AND_INT T24.X, T1.Y, literal.x,
; CM-NEXT: MOV T24.Y, 0.0,
-; CM-NEXT: LSHR * T25.Z, T21.W, literal.y,
+; CM-NEXT: LSHR * T25.Z, T0.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T25.X, T21.W, literal.x,
+; CM-NEXT: AND_INT T25.X, T0.W, literal.x,
; CM-NEXT: MOV T25.Y, 0.0,
-; CM-NEXT: LSHR * T26.Z, T21.Z, literal.y,
+; CM-NEXT: LSHR * T26.Z, T0.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T26.X, T21.Z, literal.x,
+; CM-NEXT: AND_INT T26.X, T0.Z, literal.x,
; CM-NEXT: MOV T26.Y, 0.0,
-; CM-NEXT: LSHR * T21.Z, T23.Y, literal.y,
+; CM-NEXT: LSHR * T27.Z, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T21.X, T23.Y, literal.x,
-; CM-NEXT: MOV T21.Y, 0.0,
-; CM-NEXT: LSHR * T27.Z, T23.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T27.X, T23.X, literal.x,
+; CM-NEXT: MOV T0.Z, T4.X,
+; CM-NEXT: MOV * T0.W, T5.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV T1.Y, T2.X,
+; CM-NEXT: MOV T1.Z, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T8.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T9.X,
+; CM-NEXT: MOV * T2.Z, T6.X, BS:VEC_120/SCL_212
+; CM-NEXT: AND_INT T27.X, T0.Y, literal.x,
; CM-NEXT: MOV T27.Y, 0.0,
-; CM-NEXT: LSHR * T28.Z, T23.W, literal.y,
+; CM-NEXT: LSHR * T28.Z, PV.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T28.X, T23.W, literal.x,
+; CM-NEXT: AND_INT T28.X, T2.Z, literal.x,
; CM-NEXT: MOV T28.Y, 0.0,
-; CM-NEXT: LSHR * T29.Z, T23.Z, literal.y,
+; CM-NEXT: LSHR * T29.Z, T2.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T29.X, T23.Z, literal.x,
+; CM-NEXT: AND_INT T29.X, T2.Y, literal.x,
; CM-NEXT: MOV T29.Y, 0.0,
-; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: ALU clause starting at 65:
-; CM-NEXT: AND_INT T20.X, T22.Y, literal.x,
-; CM-NEXT: MOV T20.Y, 0.0,
-; CM-NEXT: LSHR * T30.Z, T22.X, literal.y,
+; CM-NEXT: LSHR * T30.Z, T1.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T30.X, T22.X, literal.x,
+; CM-NEXT: AND_INT T30.X, T1.W, literal.x,
; CM-NEXT: MOV T30.Y, 0.0,
-; CM-NEXT: LSHR * T31.Z, T22.W, literal.y,
+; CM-NEXT: LSHR * T31.Z, T1.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T31.X, T22.W, literal.x,
+; CM-NEXT: AND_INT T31.X, T1.Z, literal.x,
; CM-NEXT: MOV T31.Y, 0.0,
-; CM-NEXT: LSHR * T32.Z, T22.Z, literal.y,
+; CM-NEXT: LSHR * T32.Z, T1.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T32.X, T22.Z, literal.x,
+; CM-NEXT: AND_INT T32.X, T1.Y, literal.x,
; CM-NEXT: MOV T32.Y, 0.0,
-; CM-NEXT: LSHR * T22.Z, T23.Y, literal.y,
+; CM-NEXT: LSHR * T33.Z, T0.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T22.X, T23.Y, literal.x,
-; CM-NEXT: MOV T22.Y, 0.0,
-; CM-NEXT: LSHR * T33.Z, T23.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T33.X, T23.X, literal.x,
+; CM-NEXT: AND_INT T33.X, T0.W, literal.x,
; CM-NEXT: MOV T33.Y, 0.0,
-; CM-NEXT: LSHR * T34.Z, T23.W, literal.y,
+; CM-NEXT: LSHR * T34.Z, T0.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T34.X, T23.W, literal.x,
+; CM-NEXT: AND_INT T34.X, T0.Z, literal.x,
; CM-NEXT: MOV T34.Y, 0.0,
-; CM-NEXT: LSHR * T35.Z, T23.Z, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: AND_INT T35.X, T23.Z, literal.x,
-; CM-NEXT: MOV T35.Y, 0.0,
; CM-NEXT: MOV * T19.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: MOV * T20.W, 0.0,
+; CM-NEXT: MOV * T21.W, 0.0,
+; CM-NEXT: MOV * T22.W, 0.0,
+; CM-NEXT: MOV * T23.W, 0.0,
; CM-NEXT: MOV * T24.W, 0.0,
; CM-NEXT: MOV * T25.W, 0.0,
; CM-NEXT: MOV * T26.W, 0.0,
-; CM-NEXT: MOV * T21.W, 0.0,
; CM-NEXT: MOV * T27.W, 0.0,
; CM-NEXT: MOV * T28.W, 0.0,
; CM-NEXT: MOV * T29.W, 0.0,
-; CM-NEXT: MOV * T20.W, 0.0,
; CM-NEXT: MOV * T30.W, 0.0,
; CM-NEXT: MOV * T31.W, 0.0,
; CM-NEXT: MOV * T32.W, 0.0,
-; CM-NEXT: MOV * T22.W, 0.0,
; CM-NEXT: MOV * T33.W, 0.0,
; CM-NEXT: MOV * T34.W, 0.0,
-; CM-NEXT: MOV * T35.W, 0.0,
+; CM-NEXT: ALU clause starting at 150:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T23.X, PV.W, literal.x,
+; CM-NEXT: LSHR T35.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; CM-NEXT: LSHR T36.X, PV.W, literal.x,
@@ -8372,335 +9246,392 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
;
; EG-LABEL: global_sextload_v32i16_to_v32i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 56, @31, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @24
-; EG-NEXT: ALU 74, @88, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 1
+; EG-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @28
+; EG-NEXT: ALU 1, @37, KC0[], KC1[]
+; EG-NEXT: TEX 0 @30
+; EG-NEXT: ALU 1, @39, KC0[], KC1[]
+; EG-NEXT: TEX 0 @32
+; EG-NEXT: ALU 1, @41, KC0[], KC1[]
+; EG-NEXT: TEX 0 @34
+; EG-NEXT: ALU 87, @43, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @131, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T37.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T36.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T35.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T34.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T33.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T32.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T30.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T28.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T23.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_128 T38.XYZW, T19.X, 48, #1
-; EG-NEXT: VTX_READ_128 T39.XYZW, T19.X, 32, #1
-; EG-NEXT: VTX_READ_128 T40.XYZW, T19.X, 16, #1
-; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 31:
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 28:
+; EG-NEXT: VTX_READ_128 T19.XYZW, T22.X, 48, #1
+; EG-NEXT: Fetch clause starting at 30:
+; EG-NEXT: VTX_READ_128 T20.XYZW, T22.X, 32, #1
+; EG-NEXT: Fetch clause starting at 32:
+; EG-NEXT: VTX_READ_128 T21.XYZW, T22.X, 16, #1
+; EG-NEXT: Fetch clause starting at 34:
+; EG-NEXT: VTX_READ_128 T22.XYZW, T22.X, 0, #1
+; EG-NEXT: ALU clause starting at 36:
+; EG-NEXT: MOV * T22.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 37:
+; EG-NEXT: MOV T5.X, T19.W,
+; EG-NEXT: MOV * T3.X, T19.Y,
+; EG-NEXT: ALU clause starting at 39:
+; EG-NEXT: MOV T9.X, T20.W,
+; EG-NEXT: MOV * T7.X, T20.Y,
+; EG-NEXT: ALU clause starting at 41:
+; EG-NEXT: MOV T13.X, T21.W,
+; EG-NEXT: MOV * T11.X, T21.Y,
+; EG-NEXT: ALU clause starting at 43:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T26.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT: LSHR T31.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
+; EG-NEXT: LSHR * T31.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T34.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: ASHR * T35.W, T20.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: LSHR T35.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; EG-NEXT: LSHR T36.X, PV.W, literal.x,
-; EG-NEXT: ASHR T35.Z, T20.Y, literal.y,
-; EG-NEXT: ASHR * T37.W, T20.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T35.X, T20.Y, 0.0, literal.x,
-; EG-NEXT: ASHR * T37.Z, T20.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T37.X, T20.X, 0.0, literal.x,
-; EG-NEXT: ASHR T35.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T19.W, T20.W, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT: LSHR * T37.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T38.X, T22.W, 0.0, literal.x,
+; EG-NEXT: ASHR * T39.W, T22.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: ASHR T19.Z, T20.W, literal.x,
-; EG-NEXT: ASHR * T41.W, T20.Z, literal.y,
+; EG-NEXT: BFE_INT T40.X, T22.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T38.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T39.Z, T22.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T19.X, T20.W, 0.0, literal.x,
-; EG-NEXT: ASHR T37.Y, T37.X, literal.y,
-; EG-NEXT: ASHR T41.Z, T20.Z, literal.x,
-; EG-NEXT: ASHR * T20.W, T40.Y, literal.y,
+; EG-NEXT: BFE_INT T39.X, T22.X, 0.0, literal.x,
+; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T41.W, T22.Z, literal.y,
+; EG-NEXT: MOV * T17.X, T22.W,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T41.X, T20.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T20.Z, T40.Y, literal.x,
-; EG-NEXT: ASHR * T42.W, T40.X, literal.y,
+; EG-NEXT: MOV T15.X, T22.Y,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T0.Z, T3.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T0.W, T9.X, BS:VEC_201
+; EG-NEXT: MOV T1.Y, T7.X,
+; EG-NEXT: MOV T1.Z, T13.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T1.W, T11.X, BS:VEC_201
+; EG-NEXT: MOV T2.Y, T17.X,
+; EG-NEXT: MOV T2.Z, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T41.Z, T22.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T40.W, PV.Z, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T41.X, T22.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T39.Y, T39.X, literal.y,
+; EG-NEXT: ASHR T40.Z, T2.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T38.W, T2.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T20.X, T40.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T22.X, T21.Y, 0.0, literal.x,
; EG-NEXT: ASHR T41.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T42.Z, T40.X, literal.x,
-; EG-NEXT: ASHR * T43.W, T40.W, literal.y,
+; EG-NEXT: ASHR T38.Z, T2.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T42.W, T21.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T42.X, T40.X, 0.0, literal.x,
-; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T43.Z, T40.W, literal.x,
-; EG-NEXT: ASHR * T44.W, T40.Z, literal.y,
+; EG-NEXT: BFE_INT T43.X, T21.W, 0.0, literal.x,
+; EG-NEXT: ASHR T22.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T42.Z, T21.X, literal.x,
+; EG-NEXT: ASHR * T44.W, T21.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T43.X, T40.W, 0.0, literal.x,
-; EG-NEXT: ASHR T42.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T44.Z, T40.Z, literal.x,
-; EG-NEXT: ASHR * T40.W, T39.Y, literal.y,
+; EG-NEXT: BFE_INT T42.X, T21.X, 0.0, literal.x,
+; EG-NEXT: ASHR * T43.Y, PV.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T44.X, T40.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T43.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T40.Z, T39.Y, literal.x,
-; EG-NEXT: ASHR * T45.W, T39.X, literal.y,
+; EG-NEXT: ALU clause starting at 131:
+; EG-NEXT: ASHR T44.Z, T21.Z, literal.x,
+; EG-NEXT: ASHR * T22.W, T1.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T40.X, T39.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T44.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T45.Z, T39.X, literal.x,
-; EG-NEXT: ASHR * T46.W, T39.W, literal.y,
+; EG-NEXT: BFE_INT T44.X, T21.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T42.Y, T42.X, literal.y,
+; EG-NEXT: ASHR T22.Z, T1.W, literal.x,
+; EG-NEXT: ASHR * T43.W, T1.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T45.X, T39.X, 0.0, literal.x,
-; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T46.Z, T39.W, literal.x,
-; EG-NEXT: ASHR * T47.W, T39.Z, literal.y,
+; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T44.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T43.Z, T1.Z, literal.x,
+; EG-NEXT: ASHR * T45.W, T20.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T46.X, T39.W, 0.0, literal.x,
-; EG-NEXT: ASHR T45.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T47.Z, T39.Z, literal.x,
-; EG-NEXT: ASHR * T39.W, T38.Y, literal.y,
+; EG-NEXT: BFE_INT T46.X, T20.W, 0.0, literal.x,
+; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T45.Z, T20.X, literal.x,
+; EG-NEXT: ASHR * T47.W, T20.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T47.X, T39.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T45.X, T20.X, 0.0, literal.x,
; EG-NEXT: ASHR T46.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T39.Z, T38.Y, literal.x,
-; EG-NEXT: ASHR * T48.W, T38.X, literal.y,
+; EG-NEXT: ASHR T47.Z, T20.Z, literal.x,
+; EG-NEXT: ASHR * T21.W, T1.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T39.X, T38.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T47.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T48.Z, T38.X, literal.x,
-; EG-NEXT: ASHR * T49.W, T38.W, literal.y,
+; EG-NEXT: BFE_INT T47.X, T20.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T45.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T21.Z, T1.Y, literal.x,
+; EG-NEXT: ASHR * T46.W, T0.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T48.X, T38.X, 0.0, literal.x,
-; EG-NEXT: ASHR T39.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T49.Z, T38.W, literal.x,
-; EG-NEXT: ASHR * T50.W, T38.Z, literal.y,
+; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T47.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T46.Z, T0.W, literal.x,
+; EG-NEXT: ASHR * T48.W, T19.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T49.X, T38.W, 0.0, literal.x,
-; EG-NEXT: ASHR T48.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T50.Z, T38.Z, literal.x,
+; EG-NEXT: BFE_INT T49.X, T19.W, 0.0, literal.x,
+; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T48.Z, T19.X, literal.x,
+; EG-NEXT: ASHR * T50.W, T19.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T50.X, T38.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T48.X, T19.X, 0.0, literal.x,
; EG-NEXT: ASHR T49.Y, PV.X, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR T50.Z, T19.Z, literal.x,
+; EG-NEXT: ASHR * T20.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: BFE_INT T50.X, T19.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T48.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T20.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T49.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T38.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T50.Y, PV.X, literal.y,
+; EG-NEXT: LSHR T19.X, PV.W, literal.x,
+; EG-NEXT: ASHR T50.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T49.Z, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v32i16_to_v32i64:
; CM: ; %bb.0:
-; CM-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 0 @22
-; CM-NEXT: ALU 55, @31, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 2 @24
-; CM-NEXT: ALU 73, @87, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T50.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T49, T36.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T48, T34.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T33.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T39, T32.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T46, T31.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T30.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T29.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T28.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T27.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T26.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T25.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T24.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T23.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T22.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T21.X
+; CM-NEXT: ALU 0, @36, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @28
+; CM-NEXT: ALU 1, @37, KC0[], KC1[]
+; CM-NEXT: TEX 0 @30
+; CM-NEXT: ALU 1, @39, KC0[], KC1[]
+; CM-NEXT: TEX 0 @32
+; CM-NEXT: ALU 1, @41, KC0[], KC1[]
+; CM-NEXT: TEX 0 @34
+; CM-NEXT: ALU 87, @43, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 53, @131, KC0[], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T49, T50.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T38.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T46, T36.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T35.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T34.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T33.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T32.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T39, T31.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T30.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T48, T29.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T28.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T27.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T26.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T25.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T24.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T23.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 22:
-; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
-; CM-NEXT: Fetch clause starting at 24:
-; CM-NEXT: VTX_READ_128 T38.XYZW, T19.X, 0, #1
-; CM-NEXT: VTX_READ_128 T39.XYZW, T19.X, 16, #1
-; CM-NEXT: VTX_READ_128 T40.XYZW, T19.X, 32, #1
-; CM-NEXT: ALU clause starting at 30:
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 28:
+; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1
+; CM-NEXT: Fetch clause starting at 30:
+; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 32, #1
+; CM-NEXT: Fetch clause starting at 32:
+; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; CM-NEXT: Fetch clause starting at 34:
+; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
+; CM-NEXT: ALU clause starting at 36:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 31:
+; CM-NEXT: ALU clause starting at 37:
+; CM-NEXT: MOV * T5.X, T22.W,
+; CM-NEXT: MOV * T3.X, T22.Y,
+; CM-NEXT: ALU clause starting at 39:
+; CM-NEXT: MOV * T9.X, T21.W,
+; CM-NEXT: MOV * T7.X, T21.Y,
+; CM-NEXT: ALU clause starting at 41:
+; CM-NEXT: MOV * T13.X, T20.W,
+; CM-NEXT: MOV * T11.X, T20.Y,
+; CM-NEXT: ALU clause starting at 43:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T21.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
-; CM-NEXT: LSHR T22.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; CM-NEXT: LSHR T23.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 208(2.914701e-43)
+; CM-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; CM-NEXT: LSHR T24.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; CM-NEXT: LSHR T25.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 176(2.466285e-43)
+; CM-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; CM-NEXT: LSHR T26.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; CM-NEXT: LSHR T27.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 144(2.017870e-43)
+; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; CM-NEXT: LSHR T28.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT: LSHR T29.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
-; CM-NEXT: LSHR T30.X, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; CM-NEXT: LSHR * T29.X, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T30.X, KC0[2].Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; CM-NEXT: LSHR T31.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; CM-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; CM-NEXT: LSHR T32.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
+; CM-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; CM-NEXT: LSHR T33.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
+; CM-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; CM-NEXT: LSHR T34.X, PV.W, literal.x,
-; CM-NEXT: ASHR * T35.W, T20.Z, literal.y,
-; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
-; CM-NEXT: LSHR T36.X, KC0[2].Y, literal.x,
-; CM-NEXT: ASHR T35.Z, T20.Z, literal.y,
-; CM-NEXT: ASHR * T37.W, T20.W, literal.z,
-; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
+; CM-NEXT: LSHR T35.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+; CM-NEXT: LSHR T36.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ASHR * T37.W, T22.Z, literal.z,
+; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T35.X, T20.Z, 0.0, literal.x,
-; CM-NEXT: ASHR * T37.Z, T20.W, literal.x,
+; CM-NEXT: LSHR T38.X, PV.Z, literal.x,
+; CM-NEXT: ASHR * T37.Z, T22.Z, literal.y,
+; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT: BFE_INT * T37.X, T22.Z, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT T37.X, T20.W, 0.0, literal.x,
-; CM-NEXT: ASHR T35.Y, PV.X, literal.y,
-; CM-NEXT: ASHR * T19.W, T20.X, literal.y,
-; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: ALU clause starting at 87:
-; CM-NEXT: ASHR T19.Z, T20.X, literal.x,
-; CM-NEXT: ASHR * T20.W, T20.Y, literal.y,
+; CM-NEXT: BFE_INT * T39.X, T22.W, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: BFE_INT T40.X, T22.Y, 0.0, literal.x,
+; CM-NEXT: ASHR T39.Y, PV.X, literal.y,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR * T22.W, T22.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T19.X, T20.X, 0.0, literal.x,
+; CM-NEXT: MOV * T17.X, T19.W,
+; CM-NEXT: MOV * T15.X, T19.Y,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: MOV * T0.W, T17.X,
+; CM-NEXT: MOV T1.Y, T11.X,
+; CM-NEXT: MOV T1.Z, T13.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T1.W, T7.X, BS:VEC_201
+; CM-NEXT: MOV T2.Y, T9.X,
+; CM-NEXT: MOV T2.Z, T3.X, BS:VEC_120/SCL_212
+; CM-NEXT: MOV * T2.W, T5.X, BS:VEC_201
+; CM-NEXT: ASHR T40.Y, T40.X, literal.x,
+; CM-NEXT: ASHR T22.Z, T22.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: ASHR * T39.W, PV.W, literal.x,
+; CM-NEXT: 31(4.344025e-44), 16(2.242078e-44)
+; CM-NEXT: BFE_INT T22.X, T22.X, 0.0, literal.x,
; CM-NEXT: ASHR T37.Y, T37.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: ASHR T20.Z, T20.Y, literal.x,
-; CM-NEXT: ASHR * T41.W, T40.Z, literal.y,
-; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T20.X, T20.Y, 0.0, literal.x,
-; CM-NEXT: ASHR T19.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T41.Z, T40.Z, literal.x,
-; CM-NEXT: ASHR * T42.W, T40.W, literal.y,
+; CM-NEXT: ASHR T39.Z, T2.W, literal.x,
+; CM-NEXT: ASHR * T40.W, T2.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T41.X, T40.Z, 0.0, literal.x,
-; CM-NEXT: ASHR T20.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T42.Z, T40.W, literal.x,
-; CM-NEXT: ASHR * T43.W, T40.X, literal.y,
+; CM-NEXT: BFE_INT T41.X, T21.W, 0.0, literal.x,
+; CM-NEXT: ASHR T22.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T40.Z, T2.Z, literal.x,
+; CM-NEXT: ASHR * T42.W, T21.Z, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T42.X, T40.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T43.X, T21.Y, 0.0, literal.x,
; CM-NEXT: ASHR T41.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T43.Z, T40.X, literal.x,
-; CM-NEXT: ASHR * T40.W, T40.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T43.X, T40.X, 0.0, literal.x,
-; CM-NEXT: ASHR T42.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T40.Z, T40.Y, literal.x,
-; CM-NEXT: ASHR * T44.W, T39.Z, literal.y,
+; CM-NEXT: ASHR T42.Z, T21.Z, literal.x,
+; CM-NEXT: ASHR * T21.W, T21.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T40.X, T40.Y, 0.0, literal.x,
-; CM-NEXT: ASHR T43.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T44.Z, T39.Z, literal.x,
-; CM-NEXT: ASHR * T45.W, T39.W, literal.y,
+; CM-NEXT: BFE_INT * T42.X, T21.Z, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 131:
+; CM-NEXT: ASHR T43.Y, T43.X, literal.x,
+; CM-NEXT: ASHR T21.Z, T21.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: ASHR * T41.W, T2.Y, literal.x,
+; CM-NEXT: 31(4.344025e-44), 16(2.242078e-44)
+; CM-NEXT: BFE_INT T21.X, T21.X, 0.0, literal.x,
+; CM-NEXT: ASHR T42.Y, T42.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: ASHR T41.Z, T2.Y, literal.x,
+; CM-NEXT: ASHR * T43.W, T1.W, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T44.X, T39.Z, 0.0, literal.x,
-; CM-NEXT: ASHR T40.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T45.Z, T39.W, literal.x,
-; CM-NEXT: ASHR * T46.W, T39.X, literal.y,
+; CM-NEXT: BFE_INT T44.X, T20.W, 0.0, literal.x,
+; CM-NEXT: ASHR T21.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T43.Z, T1.W, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: ASHR * T45.W, T20.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T45.X, T39.W, 0.0, literal.x,
+; CM-NEXT: BFE_INT T46.X, T20.Y, 0.0, literal.x,
; CM-NEXT: ASHR T44.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T46.Z, T39.X, literal.x,
-; CM-NEXT: ASHR * T39.W, T39.Y, literal.y,
+; CM-NEXT: ASHR T45.Z, T20.Z, literal.x,
+; CM-NEXT: ASHR * T20.W, T20.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T46.X, T39.X, 0.0, literal.x,
-; CM-NEXT: ASHR T45.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T39.Z, T39.Y, literal.x,
-; CM-NEXT: ASHR * T47.W, T38.Z, literal.y,
-; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T39.X, T39.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T45.X, T20.Z, 0.0, literal.x,
; CM-NEXT: ASHR T46.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T47.Z, T38.Z, literal.x,
-; CM-NEXT: ASHR * T48.W, T38.W, literal.y,
+; CM-NEXT: ASHR T20.Z, T20.X, literal.x,
+; CM-NEXT: ASHR * T44.W, T1.Z, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T47.X, T38.Z, 0.0, literal.x,
-; CM-NEXT: ASHR T39.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T48.Z, T38.W, literal.x,
-; CM-NEXT: ASHR * T49.W, T38.X, literal.y,
+; CM-NEXT: BFE_INT T20.X, T20.X, 0.0, literal.x,
+; CM-NEXT: ASHR T45.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T44.Z, T1.Z, literal.x,
+; CM-NEXT: ASHR * T46.W, T1.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T48.X, T38.W, 0.0, literal.x,
-; CM-NEXT: ASHR T47.Y, PV.X, literal.y,
-; CM-NEXT: ASHR T49.Z, T38.X, literal.x,
-; CM-NEXT: ASHR * T38.W, T38.Y, literal.y,
+; CM-NEXT: BFE_INT T47.X, T19.W, 0.0, literal.x,
+; CM-NEXT: ASHR T20.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T46.Z, T1.Y, literal.x,
+; CM-NEXT: ASHR * T48.W, T19.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T49.X, T38.X, 0.0, literal.x,
-; CM-NEXT: ASHR T48.Y, PV.X, literal.y,
-; CM-NEXT: ASHR * T38.Z, T38.Y, literal.x,
+; CM-NEXT: BFE_INT T49.X, T19.Y, 0.0, literal.x,
+; CM-NEXT: ASHR T47.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T48.Z, T19.Z, literal.x,
+; CM-NEXT: ASHR * T19.W, T19.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: BFE_INT T38.X, T38.Y, 0.0, literal.x,
+; CM-NEXT: BFE_INT T48.X, T19.Z, 0.0, literal.x,
; CM-NEXT: ASHR T49.Y, PV.X, literal.y,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR T19.Z, T19.X, literal.x,
+; CM-NEXT: ASHR * T47.W, T0.W, literal.y,
+; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; CM-NEXT: BFE_INT T19.X, T19.X, 0.0, literal.x,
+; CM-NEXT: ASHR T48.Y, PV.X, literal.y,
+; CM-NEXT: ASHR T47.Z, T0.W, literal.x,
+; CM-NEXT: ASHR * T49.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT: LSHR T50.X, PV.W, literal.x,
-; CM-NEXT: ASHR * T38.Y, PV.X, literal.y,
+; CM-NEXT: LSHR T50.X, T0.Z, literal.x,
+; CM-NEXT: ASHR T19.Y, PV.X, literal.y,
+; CM-NEXT: ASHR * T49.Z, T0.Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
%load = load <32 x i16>, ptr addrspace(1) %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
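
(The EG/CM output above expands each i16 element's sign extension into a BFE_INT, a signed bit-field extract of the low 16 bits for the low dword, plus an ASHR by 31 for the high dword. One way to express the same two-dword construction in scalar IR, with an illustrative function name, is the sketch below.)

  define i64 @sext_i16_to_i64(i16 %x) {
    ; BFE_INT dst, src, 0.0, 16 sign-extends the low 16 bits into a dword
    %lo = sext i16 %x to i32
    ; ASHR dst, lo, 31 replicates the sign bit to fill the high dword
    %hi = ashr i32 %lo, 31
    %lo64 = zext i32 %lo to i64
    %hi64 = zext i32 %hi to i64
    %hishift = shl i64 %hi64, 32
    %r = or i64 %hishift, %lo64
    ret i64 %r
  }
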
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 3ef86c13e150ac..4564e2560ef2e6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -314,7 +314,8 @@ define void @load_local_lo_v2i16_reghi_vreg_zexti8(ptr addrspace(3) %in, i32 %re
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -409,7 +410,8 @@ define void @load_local_lo_v2i16_reghi_vreg_sexti8(ptr addrspace(3) %in, i32 %re
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_i8 v0, v0
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -878,7 +880,8 @@ define void @load_global_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -921,7 +924,8 @@ define void @load_global_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -964,7 +968,8 @@ define void @load_global_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1008,7 +1013,8 @@ define void @load_global_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1130,7 +1136,8 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1170,7 +1177,8 @@ define void @load_flat_lo_v2i16_reglo_vreg_sexti8(ptr %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1210,7 +1218,8 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1251,7 +1260,8 @@ define void @load_flat_lo_v2f16_reglo_vreg_sexti8(ptr %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1590,7 +1600,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8)
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1640,7 +1651,8 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8)
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1691,7 +1703,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -1741,7 +1754,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(ptr addrspace(5) %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -1791,7 +1805,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -1927,7 +1942,8 @@ define void @load_constant_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, i32
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -1971,7 +1987,8 @@ define void @load_constant_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, i32
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
@@ -2094,7 +2111,8 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
; GFX803-NEXT: v_mov_b32_e32 v2, 44
; GFX803-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -2163,7 +2181,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; GFX803-NEXT: v_mov_b32_e32 v2, 44
; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -2232,7 +2251,8 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
; GFX803-NEXT: v_mov_b32_e32 v2, 44
; GFX803-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -2302,7 +2322,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; GFX803-NEXT: v_mov_b32_e32 v2, 44
; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
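
(Every GFX803 change in load-lo16.ll above is the same substitution: the single mask v_and_b32 dst, 0xffff0000, src becomes a lshr/shl pair by 16. For 32-bit values the two forms compute the same result, since shifting right then left by 16 clears exactly the low half; a minimal IR sketch, function name illustrative:)

  define i32 @clear_low_half(i32 %x) {
    ; and i32 %x, 0xffff0000 is equivalent to this shift pair
    %hi = lshr i32 %x, 16
    %r = shl i32 %hi, 16      ; low 16 bits are now zero
    ret i32 %r
  }
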
diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
index a4cf4d6ed2c8ec..d47eb4142f69d4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
@@ -7,6 +7,7 @@ define i32 @range_metadata_sext_i8_signed_range_i32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !0, !noundef !{} ; [-127, 128)
%shl = shl i32 %val, 24
@@ -48,7 +49,7 @@ define i32 @range_metadata_sext_i8_neg_neg_range_i32(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 63, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 7
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !3, !noundef !{}
%shl = shl i32 %val, 25
@@ -60,8 +61,12 @@ define i32 @range_metadata_sextload_i8_signed_range_i4_i32(ptr addrspace(1) %ptr
; GCN-LABEL: range_metadata_sextload_i8_signed_range_i4_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: global_load_sbyte v0, v[0:1], off glc
+; GCN-NEXT: global_load_ubyte v0, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, 3
+; GCN-NEXT: v_lshlrev_b16_e32 v0, 3, v0
+; GCN-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load volatile i8, ptr addrspace(1) %ptr, align 1, !range !4, !noundef !{}
%shl = shl i8 %load, 3
@@ -76,7 +81,9 @@ define i25 @range_metadata_sext_i8_signed_range_i25(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_bfe_i32 v0, v0, 0, 2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 30, v0
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 7, v0
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 23, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i25, ptr addrspace(1) %ptr, align 4, !range !5, !noundef !{}
%shl = shl i25 %val, 23
@@ -90,6 +97,7 @@ define i32 @range_metadata_i32_neg1_to_1(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !6, !noundef !{}
%shl = shl i32 %val, 31
@@ -103,8 +111,9 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 23, v0
-; GCN-NEXT: v_ashrrev_i64 v[0:1], 55, v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 23, v0
+; GCN-NEXT: v_ashrrev_i64 v[0:1], 55, v[1:2]
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !7, !noundef !{}
%shl = shl i64 %val, 55
@@ -118,6 +127,8 @@ define i64 @range_metadata_sext_i32_signed_range_i64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
+; GCN-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !7, !noundef !{}
%shl = shl i64 %val, 31
@@ -131,6 +142,8 @@ define i64 @range_metadata_sext_i33_signed_range_i64(ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GCN-NEXT: v_ashrrev_i64 v[0:1], 30, v[0:1]
; GCN-NEXT: s_setpc_b64 s[30:31]
%val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !8, !noundef !{}
%shl = shl i64 %val, 30
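
(The load-range-metadata-sign-bits.ll diffs all exercise sign extension in register: ashr (shl x, n), n sign-extends the low width-n bits of x, and when the backend recognizes the pair it can fold it to a single v_bfe_i32. A minimal IR sketch of the 8-bit case, function name illustrative:)

  define i32 @sext_in_reg_i8(i32 %x) {
    ; keep the low 8 bits and replicate bit 7 upward;
    ; foldable to: v_bfe_i32 dst, %x, 0, 8
    %s = shl i32 %x, 24
    %r = ashr i32 %s, 24
    ret i32 %r
  }
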
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 89abdb2b754a44..c2a59730e7d953 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1395,10 +1395,10 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_and_b32_e32 v1, -4, v0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_read_b32 v3, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_and_b32_e32 v0, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_mov_b32 s4, 0xffff
-; VI-NEXT: v_and_b32_e32 v0, 24, v2
-; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; VI-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: .LBB10_1: ; %atomicrmw.start
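
(The bf16 atomics address a 16-bit element inside its containing dword: the dword address is ptr & -4, and the bit offset within it is the low two pointer bits times 8. The old sequence computed (ptr << 3) & 24 and the new one (ptr & 3) << 3; both isolate the same two bits, as sketched below with an illustrative function name.)

  define i32 @subword_bit_offset(i32 %ptr) {
    ; (%ptr & 3) << 3 == (%ptr << 3) & 24; the offset is 0, 8, 16 or 24
    %byte = and i32 %ptr, 3
    %bits = shl i32 %byte, 3
    ret i32 %bits
  }
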
@@ -1432,10 +1432,10 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, -4, v0
; GFX9-NEXT: ds_read_b32 v3, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v0, 24, v2
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_movk_i32 s6, 0x7fff
@@ -1469,9 +1469,9 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: v_and_b32_e32 v1, -4, v0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v3, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 24, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v0
; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -1503,9 +1503,9 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: v_and_b32_e32 v1, -4, v0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 24, v2
-; GFX8-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshl_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -1541,10 +1541,10 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_and_b32_e32 v1, -4, v0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_read_b32 v3, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_and_b32_e32 v0, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_mov_b32 s4, 0xffff
-; VI-NEXT: v_and_b32_e32 v0, 24, v2
-; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; VI-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -1577,10 +1577,10 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, -4, v0
; GFX9-NEXT: ds_read_b32 v3, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v0, 24, v2
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_movk_i32 s6, 0x7fff
@@ -1613,9 +1613,9 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: v_and_b32_e32 v1, -4, v0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v3, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 24, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v0
; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -1645,9 +1645,9 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: v_and_b32_e32 v1, -4, v0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 24, v2
-; GFX8-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshl_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2070,39 +2070,41 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_read_b32 v2, v0
-; VI-NEXT: s_mov_b64 s[6:7], 0
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: .LBB16_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-NEXT: v_add_f32_e32 v5, v5, v1
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
-; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
+; VI-NEXT: v_add_f32_e32 v5, v5, v1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; VI-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; VI-NEXT: v_alignbit_b32 v2, v5, v2, 16
; VI-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; VI-NEXT: s_cbranch_execnz .LBB16_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
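
(The rounding sequence in these v2bf16 loops is the usual fp32-to-bf16 round-to-nearest-even: v_bfe_u32 extracts bit 16, the LSB that survives truncation, and adding it together with 0x7fff biases the value so that dropping the low 16 bits rounds to nearest with ties to even; the 0x400000 OR under the NaN compare quiets NaNs. A sketch of the non-NaN path in IR, function name illustrative:)

  define i16 @fp32_to_bf16_rne(i32 %bits) {
    ; tie bit: bit 16 of the f32 payload, the LSB kept after truncation
    %t = lshr i32 %bits, 16
    %tie = and i32 %t, 1
    ; add 0x7fff plus the tie bit, then drop the low 16 bits
    %b = add i32 %bits, 32767
    %r = add i32 %b, %tie
    %hi = lshr i32 %r, 16
    %bf = trunc i32 %hi to i16
    ret i16 %bf
  }
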
@@ -2110,38 +2112,40 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b32 v2, v0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: s_movk_i32 s8, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: s_mov_b32 s9, 0x7060302
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v5, v5, v1
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX9-NEXT: v_add3_u32 v6, v6, v2, s6
+; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX9-NEXT: v_add3_u32 v6, v6, v2, s8
-; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; GFX9-NEXT: v_add3_u32 v6, v8, v5, s6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX9-NEXT: v_perm_b32 v2, v5, v2, s9
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_perm_b32 v2, v5, v2, s7
; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB16_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -2149,34 +2153,39 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: ds_read_b32 v1, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2188,34 +2197,39 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: ds_read_b32 v1, v0
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_f32_e32 v6, v6, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v5, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2232,77 +2246,81 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_read_b32 v3, v0
-; VI-NEXT: s_mov_b64 s[6:7], 0
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: .LBB17_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; VI-NEXT: v_add_f32_e32 v4, v4, v2
-; VI-NEXT: v_add_f32_e32 v5, v5, v1
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_bfe_u32 v6, v4, 16, 1
-; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
+; VI-NEXT: v_add_f32_e32 v5, v5, v1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; VI-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16
; VI-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: v_mov_b32_e32 v3, v4
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; VI-NEXT: s_cbranch_execnz .LBB17_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: lds_atomic_fadd_noret_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b32 v3, v0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: s_movk_i32 s8, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: s_mov_b32 s9, 0x7060302
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX9-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v5, v5, v1
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX9-NEXT: v_add3_u32 v6, v6, v4, s6
+; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX9-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT: v_add3_u32 v6, v8, v5, s6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX9-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s7
; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: lds_atomic_fadd_noret_v2bf16:
@@ -2312,30 +2330,35 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v
; GFX7-NEXT: ds_read_b32 v4, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2349,30 +2372,35 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v
; GFX8-NEXT: ds_read_b32 v4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_add_f32_e32 v6, v6, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
index 7830bfc6ac7f59..c66a93a16757d0 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -6,12 +6,14 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s4, s4, 2
+; GCN-NEXT: s_andn2_b32 s4, s4, -2.0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%and = and i32 %x, 1073741823
@@ -26,13 +28,14 @@ define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, s4, 0x1fffffff
-; GCN-NEXT: s_lshl_b32 s4, s4, 2
+; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%and = and i32 %x, 536870911
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 994ef22539a65f..7213153a97487f 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -478,8 +478,10 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
+; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_bfe_u32 v2, v2, 8, 8
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
@@ -664,10 +666,14 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
-; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; CI-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
+; CI-NEXT: v_lshrrev_b32_e32 v4, 24, v3
+; CI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; CI-NEXT: v_bfe_u32 v3, v3, 8, 8
+; CI-NEXT: v_bfe_u32 v2, v2, 8, 8
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v3, v3, v4
+; CI-NEXT: v_or_b32_e32 v2, v2, v5
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 819b6ca98b3a83..7e120b155234bd 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -172,20 +172,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
}
define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
-; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; SDAG-GFX9: ; %bb.0:
-; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; VI: ; %bb.0:
@@ -205,23 +207,6 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,68 +229,44 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
}
define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
-; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; SDAG-GFX9: ; %bb.0:
-; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 16
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; VI-NEXT: v_mov_b32_e32 v1, 16
+; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; SDAG-CI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 16
-; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
-; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
-; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -528,3 +489,10 @@ declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone speculatable }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-GFX11: {{.*}}
+; GISEL-GFX9: {{.*}}
+; GISEL-VI: {{.*}}
+; SDAG-GFX11: {{.*}}
+; SDAG-GFX9: {{.*}}
+; SDAG-VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index bef9ff82aa396c..81aa9b09153f96 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -736,6 +736,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace
; SI-NEXT: s_sext_i32_i16 s4, s4
; SI-NEXT: s_sext_i32_i16 s5, s5
; SI-NEXT: s_max_i32 s4, s4, s5
+; SI-NEXT: s_bfe_i32 s4, s4, 0x100000
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -744,7 +745,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -757,9 +758,12 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MAX_INT T0.X, PV.Z, PV.W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MAX_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
%cmp = icmp sgt i32 %a.ext, %b.ext
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 940287d44d8d17..a9e3b200709ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -146,8 +146,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
; GCN-LABEL: mubuf_clause:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v31
-; GCN-NEXT: v_and_b32_e32 v2, 0x3ff0, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GCN-NEXT: v_add_u32_e32 v0, v0, v2
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
@@ -205,8 +205,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
; GCN-SCRATCH-LABEL: mubuf_clause:
; GCN-SCRATCH: ; %bb.0: ; %bb
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31
-; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2
+; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18
; GCN-SCRATCH-NEXT: s_clause 0x3
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 9dafa27ece86f6..8a77b3735da020 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -566,17 +566,17 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_sext_i32_i8 s8, s3
; CI-NEXT: s_bfe_i32 s9, s3, 0x80008
; CI-NEXT: s_bfe_i32 s3, s3, 0x80010
-; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: s_min_i32 s4, s4, s7
+; CI-NEXT: s_min_i32 s2, s2, s3
+; CI-NEXT: s_lshl_b32 s4, s4, 8
; CI-NEXT: s_and_b32 s2, s2, 0xff
-; CI-NEXT: s_lshl_b32 s4, s4, 24
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: s_or_b32 s2, s4, s2
+; CI-NEXT: s_or_b32 s2, s2, s4
; CI-NEXT: s_min_i32 s3, s6, s9
; CI-NEXT: s_min_i32 s4, s5, s8
; CI-NEXT: s_lshl_b32 s3, s3, 8
; CI-NEXT: s_and_b32 s4, s4, 0xff
; CI-NEXT: s_or_b32 s3, s4, s3
+; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -827,13 +827,13 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @28, KC0[], KC1[]
; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 9, @30, KC0[], KC1[]
+; EG-NEXT: ALU 11, @30, KC0[], KC1[]
; EG-NEXT: TEX 1 @16
-; EG-NEXT: ALU 10, @40, KC0[], KC1[]
+; EG-NEXT: ALU 10, @42, KC0[], KC1[]
; EG-NEXT: TEX 1 @20
-; EG-NEXT: ALU 10, @51, KC0[], KC1[]
+; EG-NEXT: ALU 11, @53, KC0[], KC1[]
; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 11, @65, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -857,13 +857,15 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 40:
+; EG-NEXT: ALU clause starting at 42:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
@@ -875,11 +877,12 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 51:
+; EG-NEXT: ALU clause starting at 53:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
+; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
@@ -887,7 +890,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 62:
+; EG-NEXT: ALU clause starting at 65:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
@@ -3353,7 +3356,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -3366,9 +3369,12 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; CI-LABEL: simplify_demanded_bits_test_min_slt_i16:
; CI: ; %bb.0:
@@ -3379,6 +3385,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
+; CI-NEXT: s_bfe_i32 s2, s2, 0x100000
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
@@ -3394,6 +3401,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
+; VI-NEXT: s_bfe_i32 s2, s2, 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -3410,6 +3418,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: s_sext_i32_i16 s3, s3
; GFX9-NEXT: s_min_i32 s2, s2, s3
+; GFX9-NEXT: s_bfe_i32 s2, s2, 0x100000
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -3425,6 +3434,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; GFX10-NEXT: s_sext_i32_i16 s2, s2
; GFX10-NEXT: s_sext_i32_i16 s3, s3
; GFX10-NEXT: s_min_i32 s2, s2, s3
+; GFX10-NEXT: s_bfe_i32 s2, s2, 0x100000
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -3441,6 +3451,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; GFX11-NEXT: s_sext_i32_i16 s3, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_i32 s2, s2, s3
+; GFX11-NEXT: s_bfe_i32 s2, s2, 0x100000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -4011,9 +4023,9 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
-; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 19, @18, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -4026,15 +4038,20 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV * T3.X, T0.X,
+; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: LSHR T1.W, T0.X, literal.x,
-; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
+; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: MOV * T2.X, T7.X,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T2.W, PV.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T7.X, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T1.Y, T1.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z,
; EG-NEXT: MIN_INT * T2.W, PV.Y, PV.X,
@@ -4164,9 +4181,9 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
-; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @18, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -4177,15 +4194,19 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: LSHR T1.W, T0.X, literal.x,
-; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
+; EG-NEXT: MOV * T3.X, T0.X,
+; EG-NEXT: MOV T0.X, PV.X,
+; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
+; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: MOV * T2.X, T7.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: LSHR T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHR * T2.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
-; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T0.Z, T0.Y, literal.x,
+; EG-NEXT: AND_INT T3.W, T0.X, literal.x,
; EG-NEXT: MIN_UINT * T1.W, PS, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 357b851a8f56f1..25a5cc19a6bdca 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -226,28 +226,40 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_smul48_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v6
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v4
; SI-NEXT: v_mul_i32_i24_e32 v0, v0, v4
-; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v6
-; SI-NEXT: v_mul_i32_i24_e32 v2, v2, v6
+; SI-NEXT: v_ashr_i64 v[4:5], v[1:2], 40
+; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 40
+; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v4, v2
+; SI-NEXT: v_mul_i32_i24_e32 v2, v4, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_ashrrev_i64 v[2:3], 40, v[0:1]
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; VI-NEXT: v_ashrrev_i64 v[5:6], 40, v[0:1]
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v4
; VI-NEXT: v_mul_i32_i24_e32 v0, v0, v4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v6
-; VI-NEXT: v_mul_i32_i24_e32 v2, v2, v6
+; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v5
+; VI-NEXT: v_mul_i32_i24_e32 v2, v2, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; GFX9-NEXT: v_ashrrev_i64 v[2:3], 40, v[0:1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX9-NEXT: v_ashrrev_i64 v[5:6], 40, v[0:1]
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v4
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v4
-; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v6
-; GFX9-NEXT: v_mul_i32_i24_e32 v2, v2, v6
+; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v5
+; GFX9-NEXT: v_mul_i32_i24_e32 v2, v2, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 3a16c88f32cc3e..9efd43670ebb96 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -12,8 +12,8 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 0xffffff
-; SI-NEXT: s_and_b32 s3, s3, 0xffffff
+; SI-NEXT: s_bfe_u32 s2, s2, 0x180000
+; SI-NEXT: s_bfe_u32 s3, s3, 0x180000
; SI-NEXT: s_mul_i32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
@@ -30,8 +30,8 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_and_b32 s0, s2, 0xffffff
-; VI-NEXT: s_and_b32 s1, s3, 0xffffff
+; VI-NEXT: s_bfe_u32 s0, s2, 0x180000
+; VI-NEXT: s_bfe_u32 s1, s3, 0x180000
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT: s_bfe_u32 s0, s2, 0x180000
+; GFX9-NEXT: s_bfe_u32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -491,33 +491,37 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b
; SI-LABEL: test_umul24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s8, s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s7, s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_and_b32 s4, s6, 0xffffff
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s5, s7, 0xffffff
-; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: s_mul_i32 s4, s4, s5
-; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_lshl_b32 s1, s6, 8
+; SI-NEXT: s_lshr_b64 s[2:3], s[0:1], 40
+; SI-NEXT: s_lshl_b32 s1, s8, 8
+; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 40
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_mul_i32 s0, s2, s0
+; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s7, s[0:1], 0x34
+; VI-NEXT: s_load_dword s8, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_lshl_b32 s5, s6, 8
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 40
+; VI-NEXT: s_lshl_b32 s5, s8, 8
+; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 40
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
; VI-NEXT: v_mul_u32_u24_e32 v0, s6, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -526,19 +530,21 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b
; GFX9-LABEL: test_umul24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
+; GFX9-NEXT: s_lshl_b32 s5, s6, 8
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 40
+; GFX9-NEXT: s_lshl_b32 s5, s8, 8
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 40
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s4
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
@@ -582,25 +588,30 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) {
; SI-LABEL: test_umul24_i64_square:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s4, s[0:1], 0x13
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s1, s[0:1], 0x13
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s5, s4, 0xffffff
-; SI-NEXT: s_mul_i32 s5, s5, s5
-; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_lshl_b32 s1, s1, 8
+; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 40
+; SI-NEXT: s_mul_i32 s1, s0, s0
+; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s0, s0
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i64_square:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s5, s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s5, s5, 8
+; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 40
; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -610,10 +621,12 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT: s_lshl_b32 s1, s2, 8
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -685,53 +698,71 @@ entry:
define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_umul24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s2, s[0:1], 0xb
-; SI-NEXT: s_load_dword s0, s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s1, s2, 0xffffff
-; SI-NEXT: s_and_b32 s3, s0, 0xffffff
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
-; SI-NEXT: s_mul_i32 s1, s1, s3
-; SI-NEXT: v_and_b32_e32 v1, 1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 9
+; SI-NEXT: s_lshl_b64 s[6:7], s[8:9], 9
+; SI-NEXT: s_and_b32 s5, s5, 1
+; SI-NEXT: s_and_b32 s7, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_alignbit_b32 v2, s7, v0, 9
+; SI-NEXT: v_alignbit_b32 v1, s5, v1, 9
+; SI-NEXT: v_mul_lo_u32 v0, v1, v2
+; SI-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 1, v1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1
-; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 9
+; VI-NEXT: s_lshl_b64 s[6:7], s[8:9], 9
+; VI-NEXT: s_and_b32 s7, s7, 1
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_and_b32 s5, s5, 1
+; VI-NEXT: v_alignbit_b32 v1, s7, v0, 9
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_alignbit_b32 v2, s5, v0, 9
+; VI-NEXT: v_mul_u32_u24_e32 v0, v2, v1
+; VI-NEXT: v_mul_hi_u32_u24_e32 v1, v2, v1
; VI-NEXT: v_and_b32_e32 v1, 1, v1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i33:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT: s_mul_i32 s2, s0, s1
-; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 9
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], 9
+; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_and_b32 s7, s7, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 9
+; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 9
+; GFX9-NEXT: v_mul_hi_u32 v2, v1, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp0 = shl i33 %a, 9
@@ -747,46 +778,68 @@ entry:
define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_umulhi24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s4, s[0:1], 0xd
-; SI-NEXT: s_load_dword s5, s[0:1], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 9
+; SI-NEXT: s_lshl_b64 s[6:7], s[8:9], 9
+; SI-NEXT: s_and_b32 s5, s5, 1
+; SI-NEXT: s_and_b32 s7, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_alignbit_b32 v0, s7, v0, 9
+; SI-NEXT: v_alignbit_b32 v1, s5, v1, 9
+; SI-NEXT: v_mul_hi_u32_u24_e32 v0, v1, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 9
+; VI-NEXT: s_lshl_b64 s[6:7], s[8:9], 9
+; VI-NEXT: s_and_b32 s5, s5, 1
+; VI-NEXT: s_and_b32 s7, s7, 1
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_alignbit_b32 v0, s7, v0, 9
+; VI-NEXT: v_alignbit_b32 v1, s5, v1, 9
+; VI-NEXT: v_mul_hi_u32_u24_e32 v0, v1, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i33:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 9
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], 9
+; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_and_b32 s2, s7, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, 9
+; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 9
+; GFX9-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp0 = shl i33 %a, 9
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll
index 31a1124f474783..93ac6bf438ad30 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll
@@ -5,30 +5,36 @@
define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; CM-LABEL: test_umul24_i32:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
-; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.y,
-; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
-; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
-; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
-; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
-; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.y,
+; CM-NEXT: LSHR * T0.W, PV.Z, literal.y,
+; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
;
; EG-LABEL: test_umul24_i32:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.x,
-; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.W, PS, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -218,39 +224,44 @@ entry:
define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; CM-LABEL: test_umul24_i64:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT * T0.Z, KC0[3].Y, literal.y,
-; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
-; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
-; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
-; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
-; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
-; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
-; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
-; CM-NEXT: MULHI_UINT24 T1.X (MASKED), KC0[2].W, KC0[3].Y,
-; CM-NEXT: MULHI_UINT24 T1.Y, KC0[2].W, KC0[3].Y,
-; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
-; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.y,
+; CM-NEXT: LSHR * T0.W, PV.Z, literal.y,
+; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT: MULHI_UINT24 T1.X (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULHI_UINT24 T1.Y, T0.W, T1.Z,
+; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
;
; EG-LABEL: test_umul24_i64:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
-; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
+; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.W, PS, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: MULHI_UINT24 * T0.Y, PS, PV.W,
+; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, T1.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MULHI_UINT24 * T1.Y, KC0[2].W, KC0[3].Y,
entry:
%tmp0 = shl i64 %a, 40
%a_24 = lshr i64 %tmp0, 40
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 8ac332197215f5..54a85e79124482 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -611,50 +611,51 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s1, s1, 8
-; GFX10-NEXT: s_lshr_b32 s4, s9, 16
-; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9
-; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8
-; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4
-; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, s0
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, s1
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, s5
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, s0
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: s_lshr_b32 s4, s9, 8
+; GFX10-NEXT: s_lshr_b32 s0, s8, 16
+; GFX10-NEXT: v_or_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, s8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, s8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: shuffle8i8:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
-; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9
-; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, s9
+; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s8
+; GFX9-NEXT: v_lshrrev_b16_e64 v3, 8, s8
+; GFX9-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_lshr_b32 s1, s9, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8
-; GFX9-NEXT: v_and_b32_e32 v0, s8, v0
-; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1
-; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s1
+; GFX9-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
bb:
@@ -788,17 +789,16 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
-; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
+; GFX10-NEXT: v_add_nc_u16 v1, v0, v1
; GFX10-NEXT: v_add_nc_u16 v3, v2, v9
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
-; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: global_store_dword v[5:6], v0, off
-; GFX10-NEXT: global_store_dword v[7:8], v1, off
+; GFX10-NEXT: global_store_dword v[5:6], v1, off
+; GFX10-NEXT: global_store_dword v[7:8], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_store:
@@ -806,19 +806,19 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v9, v[2:3], off
-; GFX9-NEXT: s_movk_i32 s4, 0xff00
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4
-; GFX9-NEXT: v_and_b32_sdwa v1, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT: v_or_b32_e32 v1, v0, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v9
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_add_u16_e32 v3, v0, v9
+; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: global_store_dword v[5:6], v0, off
-; GFX9-NEXT: global_store_dword v[7:8], v1, off
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: global_store_dword v[5:6], v1, off
+; GFX9-NEXT: global_store_dword v[7:8], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
@@ -848,17 +848,16 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
-; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
+; GFX10-NEXT: v_add_nc_u16 v1, v0, v1
; GFX10-NEXT: v_add_nc_u16 v3, v2, v9
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
-; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: global_store_dword v[5:6], v0, off
-; GFX10-NEXT: global_store_dword v[7:8], v1, off
+; GFX10-NEXT: global_store_dword v[5:6], v1, off
+; GFX10-NEXT: global_store_dword v[7:8], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_store_div_16:
@@ -872,17 +871,17 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_movk_i32 s4, 0xff00
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9
-; GFX9-NEXT: v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v2, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v9
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u16_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT: v_add_u16_sdwa v2, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: global_store_dword v[5:6], v0, off
; GFX9-NEXT: global_store_dword v[7:8], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -989,13 +988,13 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9
+; GFX10-NEXT: v_and_b32_sdwa v3, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006
+; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: global_store_dword v[7:8], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1014,16 +1013,15 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: s_mov_b32 s4, 0x5070006
; GFX9-NEXT: v_mov_b32_e32 v0, 2
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
; GFX9-NEXT: v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v9, 0x100, v4
+; GFX9-NEXT: v_and_b32_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
; GFX9-NEXT: global_store_dword v[7:8], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1295,12 +1293,12 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v0, 26
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4
+; GFX10-NEXT: v_lshrrev_b16 v1, 9, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1320,21 +1318,21 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v9, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v0, 26
; GFX9-NEXT: s_mov_b32 s4, 0x1030707
+; GFX9-NEXT: v_mov_b32_e32 v0, 26
+; GFX9-NEXT: v_mov_b32_e32 v1, 9
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b16_e32 v3, 1, v4
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v2, v9, v4, s4
; GFX9-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 25, v9
-; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 25, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 26, v4
-; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0x7f00, v3
-; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
-; GFX9-NEXT: global_store_dword v[7:8], v1, off
+; GFX9-NEXT: global_store_dword v[7:8], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2582,27 +2580,24 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v9, v[2:3], off
-; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00
-; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
+; GFX10-NEXT: global_load_dword v9, v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
-; GFX10-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x200, v0
-; GFX10-NEXT: v_xor_b32_e32 v3, 0x100, v3
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5060307
+; GFX10-NEXT: v_xor_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_xor_b32_sdwa v3, v9, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_sdwa v0, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x5060307
+; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: global_store_dword v[7:8], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2618,24 +2613,20 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v9, v[2:3], off
-; GFX9-NEXT: s_movk_i32 s4, 0xff00
+; GFX9-NEXT: s_mov_b32 s4, 0x5060307
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: s_mov_b32 s5, 0x5060307
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
-; GFX9-NEXT: v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v2, v9, v4, s4
+; GFX9-NEXT: v_xor_b32_sdwa v3, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_xor_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_xor_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_xor_b32_e32 v2, 0x200, v2
-; GFX9-NEXT: v_xor_b32_e32 v3, 0x100, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v4, v9, v4, s5
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
-; GFX9-NEXT: global_store_dword v[7:8], v4, off
+; GFX9-NEXT: global_store_dword v[7:8], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3614,14 +3605,14 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_3src:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v0, 16
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2
; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
@@ -3633,15 +3624,15 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, 16
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
-; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 8, v1
+; GFX9-NEXT: v_or3_b32 v0, v1, v0, v2
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 77e1694dbe7e19..1f48eacd396744 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -1606,11 +1606,12 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-LABEL: v_mul_add_1_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0x10000, v3
+; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
@@ -1649,10 +1650,11 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
; GFX67-LABEL: v_mul_add_1_v2i16_commute:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0x10000, v3
+; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0
@@ -1701,7 +1703,7 @@ define <2 x i16> @v_mul_add_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_x_v2i16:
@@ -1737,11 +1739,12 @@ define <2 x i16> @v_mul_sub_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-LABEL: v_mul_sub_1_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_add_i32_e32 v2, vcc, -1, v2
-; GFX67-NEXT: v_add_i32_e32 v3, vcc, -1, v3
-; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0xffff0000, v3
+; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
@@ -1780,10 +1783,11 @@ define <2 x i16> @v_mul_sub_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
; GFX67-LABEL: v_mul_sub_1_v2i16_commute:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_add_i32_e32 v2, vcc, -1, v2
-; GFX67-NEXT: v_add_i32_e32 v3, vcc, -1, v3
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0xffff0000, v3
+; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0
@@ -1834,7 +1838,7 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_sub_x_v2i16:
@@ -1870,11 +1874,12 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-LABEL: v_mul_add_2_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_add_i32_e32 v2, vcc, 2, v2
-; GFX67-NEXT: v_add_i32_e32 v3, vcc, 2, v3
-; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0x20000, v3
+; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
@@ -1913,11 +1918,12 @@ define <2 x i16> @v_mul_sub_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX67-LABEL: v_mul_sub_2_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_add_i32_e32 v2, vcc, -2, v2
-; GFX67-NEXT: v_add_i32_e32 v3, vcc, -2, v3
-; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0xfffe0000, v3
+; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
@@ -2695,14 +2701,15 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
; GFX67-LABEL: v_mul_9_add_52_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, 9, 52
+; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, 9, v1
; GFX67-NEXT: v_mad_u32_u24 v0, v0, 9, 52
-; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 0x340000, v1
+; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_9_add_52_v2i16:
@@ -2929,14 +2936,15 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
; GFX67-LABEL: v_mul_5_add_1_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, 5, 1
+; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, 5, v1
; GFX67-NEXT: v_mad_u32_u24 v0, v0, 5, 1
-; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 0x10000, v1
+; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_5_add_1_v2i16:
@@ -2971,16 +2979,17 @@ define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) {
; GFX67-LABEL: v_mul_284_add_82_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: s_movk_i32 s4, 0x11c
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, 0x11c, v1
; GFX67-NEXT: v_mov_b32_e32 v2, 0x52
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, s4, v2
; GFX67-NEXT: v_mad_u32_u24 v0, v0, s4, v2
-; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xfffe, v0
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX67-NEXT: v_and_b32_e32 v1, 0xfffe, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 0x520000, v1
+; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_284_add_82_v2i16:
@@ -3635,20 +3644,20 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u16 v1, v1, v3, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v3, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v2, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_v2i8:
@@ -3656,9 +3665,9 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_u16 v1, v1, v3, v1
; GFX10-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX10-NEXT: v_lshlrev_b16 v2, 8, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i8> %y, <i8 1, i8 1>
%mul = mul <2 x i8> %x, %add
@@ -3686,20 +3695,20 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u16 v1, v1, v3, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_v2i8_commute:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v3, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v2, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_v2i8_commute:
@@ -3707,9 +3716,9 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_u16 v1, v1, v3, v1
; GFX10-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX10-NEXT: v_lshlrev_b16 v2, 8, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i8> %y, <i8 1, i8 1>
%mul = mul <2 x i8> %add, %x
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index b068d87c4d6f48..f5a0c293755321 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -1343,11 +1343,15 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v24, v6
; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v0, v5, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr7
+; GFX9-O0-NEXT: ; implicit-def: $sgpr7
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, s6
+; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19
+; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[5:6]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[5:6]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v24
@@ -2737,11 +2741,15 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v8
; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v5, v7, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7
+; GFX9-O0-NEXT: ; implicit-def: $sgpr7
+; GFX9-O0-NEXT: ; implicit-def: $sgpr7
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6
+; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15
+; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], s4, v[7:8]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[7:8]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20
@@ -2877,14 +2885,22 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
; GFX9-NEXT: v_mov_b32_e32 v5, v4
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v2, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX9-NEXT: v_and_b32_e32 v4, -2, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], 31, v[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v5
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 33, v[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v10
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 33, v[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], 31, v[6:7]
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_or_b32_e32 v7, v9, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O0-LABEL: v_srem_i128_v_pow2k:
@@ -2912,39 +2928,44 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: s_mov_b32 s5, s6
-; GFX9-O0-NEXT: s_mov_b32 s4, s7
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-O0-NEXT: s_mov_b32 s6, s8
+; GFX9-O0-NEXT: s_mov_b32 s5, s9
; GFX9-O0-NEXT: v_add_co_u32_e32 v6, vcc, v5, v4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v2, vcc
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v2, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v2, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v3, v2, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT: s_mov_b32 s6, -2
-; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
-; GFX9-O0-NEXT: s_mov_b32 s5, s6
-; GFX9-O0-NEXT: s_mov_b32 s6, s5
-; GFX9-O0-NEXT: v_and_b32_e64 v4, v4, s6
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT: v_and_b32_e64 v9, v6, s4
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT: v_lshlrev_b64 v[11:12], s4, v[11:12]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: s_mov_b32 s4, 33
+; GFX9-O0-NEXT: v_lshrrev_b64 v[9:10], s4, v[9:10]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10
+; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9
+; GFX9-O0-NEXT: v_or_b32_e64 v9, v4, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7
; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 9c5214338c54a7..ab65b3adf6915c 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -28,8 +28,9 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX9-LABEL: v_saddsat_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 8
+; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -37,6 +38,8 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX10PLUS-LABEL: v_saddsat_i8:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
@@ -142,7 +145,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 5260a4847f70d4..064627059d2b48 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -105,12 +105,15 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; SI-NEXT: v_or_b32_e32 v2, v1, v0
-; SI-NEXT: v_and_b32_e32 v1, 0xff00, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; SI-NEXT: v_or_b32_e32 v1, v0, v3
-; SI-NEXT: v_or_b32_e32 v0, v2, v3
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: s_bfe_u32 s0, s0, 0x180000
+; SI-NEXT: s_lshl_b32 s1, s0, 8
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_lshl_b32 s1, s0, 16
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: v_or_b32_e32 v0, s1, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -121,12 +124,13 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
-; VI-NEXT: v_or_b32_e32 v2, v1, v0
-; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v2
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; VI-NEXT: v_or_b32_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
bb:
@@ -145,12 +149,15 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; SI-NEXT: v_or_b32_e32 v2, v1, v0
-; SI-NEXT: v_and_b32_e32 v1, 0xff00, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; SI-NEXT: v_or_b32_e32 v1, v0, v3
-; SI-NEXT: v_or_b32_e32 v0, v2, v3
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: s_bfe_u32 s0, s0, 0x180000
+; SI-NEXT: s_lshl_b32 s1, s0, 8
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_lshl_b32 s1, s0, 16
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: v_or_b32_e32 v0, s1, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -162,8 +169,9 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
-; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index f7eb42a5f93227..079f17ea43fff8 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -90,9 +90,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s0, s0, s4
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -216,9 +216,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s3, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -361,7 +361,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -374,7 +374,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
@@ -470,7 +470,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -481,7 +481,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -582,7 +582,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -593,7 +593,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -662,9 +662,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-FLATSCR-LABEL: ps_main:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -757,13 +757,13 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -844,8 +844,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -947,13 +947,13 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -1031,45 +1031,47 @@ define amdgpu_ps float @ps_main(i32 %idx) {
;
; GFX11-FLATSCR-LABEL: ps_main:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v8, 0x3f3d349e :: v_dual_lshlrev_b32 v37, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, 0x3f638e37 :: v_dual_mov_b32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v36, v6
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
@@ -1165,9 +1167,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s0, s0, s4
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -1291,9 +1293,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s3, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -1436,7 +1438,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1449,7 +1451,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
@@ -1545,7 +1547,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -1556,7 +1558,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1657,7 +1659,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -1668,7 +1670,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1737,9 +1739,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-FLATSCR-LABEL: vs_main:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -1832,13 +1834,13 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -1919,8 +1921,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2022,13 +2024,13 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -2106,45 +2108,47 @@ define amdgpu_vs float @vs_main(i32 %idx) {
;
; GFX11-FLATSCR-LABEL: vs_main:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v8, 0x3f3d349e :: v_dual_lshlrev_b32 v37, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, 0x3f638e37 :: v_dual_mov_b32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v36, v6
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
@@ -2237,9 +2241,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s0, s0, s4
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -2363,9 +2367,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s3, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -2508,7 +2512,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2521,7 +2525,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
@@ -2617,7 +2621,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -2628,7 +2632,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2729,7 +2733,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -2740,7 +2744,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2809,9 +2813,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-FLATSCR-LABEL: cs_main:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v27, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v27, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -2906,13 +2910,13 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -2993,8 +2997,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v27, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v27, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -3098,13 +3102,13 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -3182,45 +3186,47 @@ define amdgpu_cs float @cs_main(i32 %idx) {
;
; GFX11-FLATSCR-LABEL: cs_main:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v8, 0x3f3d349e :: v_dual_lshlrev_b32 v37, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, 0x3f638e37 :: v_dual_mov_b32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v36, v6
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
@@ -3295,9 +3301,9 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s0, s0, s4
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -3421,9 +3427,9 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s3, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -3565,7 +3571,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3578,7 +3584,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
@@ -3673,7 +3679,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -3684,7 +3690,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -3784,7 +3790,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -3795,7 +3801,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -3864,9 +3870,9 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-FLATSCR-LABEL: hs_main:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -3959,13 +3965,13 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -4046,8 +4052,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -4149,13 +4155,13 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -4233,45 +4239,47 @@ define amdgpu_hs float @hs_main(i32 %idx) {
;
; GFX11-FLATSCR-LABEL: hs_main:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v8, 0x3f3d349e :: v_dual_lshlrev_b32 v37, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, 0x3f638e37 :: v_dual_mov_b32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v36, v6
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
@@ -4365,9 +4373,9 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s0, s0, s4
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -4491,9 +4499,9 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s3, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -4635,7 +4643,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -4648,7 +4656,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
@@ -4743,7 +4751,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -4754,7 +4762,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -4854,7 +4862,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
@@ -4865,7 +4873,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -4934,9 +4942,9 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-FLATSCR-LABEL: gs_main:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -5029,13 +5037,13 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -5116,8 +5124,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -5219,13 +5227,13 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -5303,45 +5311,47 @@ define amdgpu_gs float @gs_main(i32 %idx) {
;
; GFX11-FLATSCR-LABEL: gs_main:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v8, 0x3f3d349e :: v_dual_lshlrev_b32 v37, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, 0x3f638e37 :: v_dual_mov_b32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v36, v6
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
@@ -5442,9 +5452,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s11, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s8, s8, s6
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s9, s9, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -5568,9 +5578,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s11, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s8, s8, s6
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s9, s9, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -5713,7 +5723,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -5726,7 +5736,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
@@ -5822,7 +5832,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
@@ -5833,7 +5843,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5934,7 +5944,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
@@ -5945,7 +5955,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6015,9 +6025,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -6110,14 +6120,14 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -6199,8 +6209,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -6302,14 +6312,14 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -6388,41 +6398,44 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
;
; GFX11-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v6, 0x3f5f2ee2
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_lshlrev_b32_e32 v37, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v30, v13
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
@@ -6523,9 +6536,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s11, 0xe8f000
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_add_u32 s8, s8, s6
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_addc_u32 s9, s9, 0
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
@@ -6649,9 +6662,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s11, 0xe80000
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: s_add_u32 s8, s8, s6
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_addc_u32 s9, s9, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
@@ -6794,7 +6807,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6807,7 +6820,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
@@ -6903,7 +6916,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
@@ -6914,7 +6927,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -7015,7 +7028,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
@@ -7026,7 +7039,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -7096,9 +7109,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
-; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
@@ -7191,14 +7204,14 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -7280,8 +7293,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v23, 2, v0
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
@@ -7383,14 +7396,14 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
-; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
@@ -7469,41 +7482,44 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
;
; GFX11-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
; GFX11-FLATSCR: ; %bb.0:
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v6, 0x3f5f2ee2
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_lshlrev_b32_e32 v37, 2, v0
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v27, v24
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
-; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v30, v13
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index ea30a63b0be19a..36483bc54922b5 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -1760,7 +1760,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 37, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1803,6 +1803,10 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45)
@@ -1943,7 +1947,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 29, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1982,6 +1986,10 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index b086640c72f804..f5b814f1ed6ee4 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -27,77 +27,83 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
; GCN-NEXT: s_add_u32 s2, s2, s12
; GCN-NEXT: s_mov_b32 s13, s12
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
; GCN-NEXT: s_addc_u32 s3, s3, s12
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_lo_u32 v6, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13]
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v3
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v3, s2, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s2, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s3, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s3, v1
; GCN-NEXT: v_mul_lo_u32 v1, s3, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; GCN-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
+; GCN-NEXT: v_mul_lo_u32 v3, s10, v1
+; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_mul_lo_u32 v3, s10, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
@@ -239,118 +245,124 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-NEXT: v_xor_b32_e32 v3, v5, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
-; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GCN-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc
; GCN-NEXT: v_madmk_f32 v5, v6, 0x4f800000, v5
; GCN-NEXT: v_rcp_f32_e32 v5, v5
; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GCN-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GCN-NEXT: v_trunc_f32_e32 v6, v6
; GCN-NEXT: v_madmk_f32 v5, v6, 0xcf800000, v5
-; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GCN-NEXT: v_mul_hi_u32 v9, v7, v5
-; GCN-NEXT: v_mul_lo_u32 v10, v7, v6
-; GCN-NEXT: v_mul_lo_u32 v11, v8, v5
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GCN-NEXT: v_mul_lo_u32 v10, v7, v5
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GCN-NEXT: v_mul_lo_u32 v11, v5, v9
-; GCN-NEXT: v_mul_hi_u32 v12, v5, v10
-; GCN-NEXT: v_mul_hi_u32 v13, v5, v9
-; GCN-NEXT: v_mul_hi_u32 v14, v6, v9
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v9
+; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v8, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v9, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v9, v8
+; GCN-NEXT: v_mul_lo_u32 v11, v10, v7
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GCN-NEXT: v_mul_lo_u32 v6, v9, v7
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GCN-NEXT: v_mul_lo_u32 v11, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v12, v7, v6
+; GCN-NEXT: v_mul_hi_u32 v13, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v14, v8, v5
+; GCN-NEXT: v_mul_lo_u32 v5, v8, v5
; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
-; GCN-NEXT: v_mul_lo_u32 v13, v6, v10
-; GCN-NEXT: v_mul_hi_u32 v10, v6, v10
+; GCN-NEXT: v_mul_lo_u32 v13, v8, v6
+; GCN-NEXT: v_mul_hi_u32 v6, v8, v6
; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GCN-NEXT: v_addc_u32_e32 v10, vcc, v12, v10, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc
+; GCN-NEXT: v_add_i32_e32 v12, vcc, v6, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc
+; GCN-NEXT: v_lshl_b64 v[5:6], v[5:6], 32
+; GCN-NEXT: v_or_b32_e32 v5, v5, v12
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v5
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v5, v9, v7
+; GCN-NEXT: v_mul_hi_u32 v6, v9, v7
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v8
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v7
+; GCN-NEXT: v_mul_hi_u32 v11, v8, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GCN-NEXT: v_mul_lo_u32 v9, v7, v6
; GCN-NEXT: v_mul_hi_u32 v10, v7, v5
-; GCN-NEXT: v_mul_lo_u32 v8, v8, v5
-; GCN-NEXT: v_mul_lo_u32 v7, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v12, v7, v6
+; GCN-NEXT: v_mul_lo_u32 v5, v8, v5
; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_mul_lo_u32 v11, v5, v8
-; GCN-NEXT: v_mul_hi_u32 v12, v5, v7
-; GCN-NEXT: v_mul_hi_u32 v13, v5, v8
-; GCN-NEXT: v_mul_hi_u32 v10, v6, v7
-; GCN-NEXT: v_mul_lo_u32 v7, v6, v7
-; GCN-NEXT: v_mul_hi_u32 v9, v6, v8
-; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
-; GCN-NEXT: v_mul_lo_u32 v8, v6, v8
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v10, vcc
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v7
-; GCN-NEXT: v_mul_lo_u32 v8, v0, v6
-; GCN-NEXT: v_mul_hi_u32 v9, v0, v5
-; GCN-NEXT: v_mul_hi_u32 v10, v0, v6
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v10, v1, v5
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT: v_mul_hi_u32 v11, v1, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v1, v6
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v12, vcc
+; GCN-NEXT: v_mul_hi_u32 v12, v8, v6
+; GCN-NEXT: v_mul_lo_u32 v6, v8, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v11, vcc
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v5, v6
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; GCN-NEXT: v_lshl_b64 v[5:6], v[5:6], 32
+; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GCN-NEXT: v_or_b32_e32 v5, v5, v10
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GCN-NEXT: v_xor_b32_e32 v11, v0, v9
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v7, v5
+; GCN-NEXT: v_xor_b32_e32 v10, v1, v9
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v6, v11, v1
+; GCN-NEXT: v_mul_hi_u32 v7, v11, v0
+; GCN-NEXT: v_mul_hi_u32 v8, v11, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v10, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v10, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT: v_mul_hi_u32 v8, v10, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v10, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v7, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v3, v0
+; GCN-NEXT: v_mul_lo_u32 v6, v3, v1
+; GCN-NEXT: v_mul_lo_u32 v7, v2, v0
; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v8, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v9, v3, v5
-; GCN-NEXT: v_mul_lo_u32 v10, v2, v5
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_mul_lo_u32 v9, v3, v5
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_sub_i32_e32 v10, vcc, v1, v8
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v2, vcc
-; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v3
-; GCN-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
+; GCN-NEXT: v_mul_lo_u32 v6, v3, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT: v_sub_i32_e32 v7, vcc, v10, v5
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v11, v6
+; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v2, vcc
+; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v3
+; GCN-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2
; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3
-; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v2
-; GCN-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5]
-; GCN-NEXT: v_add_i32_e64 v10, s[4:5], 2, v5
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GCN-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; GCN-NEXT: v_add_i32_e64 v12, s[4:5], 1, v5
-; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; GCN-NEXT: v_addc_u32_e64 v13, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
-; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v10, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v2, v7, v4
-; GCN-NEXT: v_xor_b32_e32 v3, v0, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v1, v2
+; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
+; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v2
+; GCN-NEXT: v_cndmask_b32_e64 v7, v11, v8, s[4:5]
+; GCN-NEXT: v_add_i32_e64 v8, s[4:5], 2, v0
+; GCN-NEXT: v_subb_u32_e32 v5, vcc, v10, v5, vcc
+; GCN-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, s[4:5]
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
+; GCN-NEXT: v_add_i32_e64 v12, s[4:5], 1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
+; GCN-NEXT: v_addc_u32_e64 v13, s[4:5], 0, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v2
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
+; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, v12, v8, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v13, v11, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_xor_b32_e32 v2, v9, v4
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_sdiv:
@@ -1209,58 +1221,62 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v3
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v2, v1, 24
; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
-; GCN-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v1, s3, v0
@@ -1388,105 +1404,109 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
-; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT: v_madmk_f32 v3, v4, 0x4f800000, v3
-; GCN-NEXT: v_rcp_f32_e32 v3, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GCN-NEXT: v_trunc_f32_e32 v4, v4
-; GCN-NEXT: v_madmk_f32 v3, v4, 0xcf800000, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT: v_mul_hi_u32 v7, v5, v3
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v4
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GCN-NEXT: v_mul_lo_u32 v9, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v10, v3, v8
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v7
-; GCN-NEXT: v_mul_lo_u32 v7, v4, v7
+; GCN-NEXT: v_xor_b32_e32 v3, v1, v2
+; GCN-NEXT: v_xor_b32_e32 v4, v0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3
+; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc
+; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
+; GCN-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT: v_trunc_f32_e32 v1, v1
+; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v1, v7, v6
+; GCN-NEXT: v_mul_lo_u32 v9, v8, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v7, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GCN-NEXT: v_mul_lo_u32 v9, v5, v0
+; GCN-NEXT: v_mul_hi_u32 v10, v5, v1
+; GCN-NEXT: v_mul_hi_u32 v11, v5, v0
+; GCN-NEXT: v_mul_hi_u32 v12, v6, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v6, v0
; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GCN-NEXT: v_mul_lo_u32 v11, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v8, v4, v8
+; GCN-NEXT: v_mul_lo_u32 v11, v6, v1
+; GCN-NEXT: v_mul_hi_u32 v1, v6, v1
; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v9, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v10
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v0
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v6
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v5
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; GCN-NEXT: v_mul_lo_u32 v7, v5, v1
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v0
+; GCN-NEXT: v_mul_hi_u32 v10, v5, v1
+; GCN-NEXT: v_mul_lo_u32 v0, v6, v0
; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v7, v5, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v5, v3
-; GCN-NEXT: v_mul_lo_u32 v6, v6, v3
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_mul_lo_u32 v9, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v10, v3, v5
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v8, v4, v5
-; GCN-NEXT: v_mul_lo_u32 v5, v4, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v6
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v6
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v4, 24
-; GCN-NEXT: v_mul_hi_u32 v3, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v4, v4, 24
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v3
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc
+; GCN-NEXT: v_mul_hi_u32 v10, v6, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v6, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v8, v9, vcc
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v7, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v8
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v5, v1, 24
+; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
+; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v5, v4, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GCN-NEXT: v_mul_lo_u32 v5, v4, v0
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
; GCN-NEXT: v_sub_i32_e32 v5, vcc, 24, v5
-; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v1, vcc
-; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v0
+; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v3, vcc
+; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v4
; GCN-NEXT: v_subbrev_u32_e64 v6, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0
+; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v1
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[4:5]
-; GCN-NEXT: v_add_i32_e64 v7, s[4:5], 2, v3
+; GCN-NEXT: v_add_i32_e64 v7, s[4:5], 2, v0
; GCN-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5]
-; GCN-NEXT: v_add_i32_e64 v9, s[4:5], 1, v3
+; GCN-NEXT: v_add_i32_e64 v9, s[4:5], 1, v0
; GCN-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v7, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, v0, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v1, v2
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_sdiv_k_num_i64:
@@ -1585,101 +1605,106 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
-; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT: v_madmk_f32 v3, v4, 0x4f800000, v3
-; GCN-NEXT: v_rcp_f32_e32 v3, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GCN-NEXT: v_trunc_f32_e32 v4, v4
-; GCN-NEXT: v_madmk_f32 v3, v4, 0xcf800000, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT: v_mul_hi_u32 v7, v5, v3
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v4
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GCN-NEXT: v_mul_lo_u32 v9, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v10, v3, v8
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v7
-; GCN-NEXT: v_mul_lo_u32 v7, v4, v7
+; GCN-NEXT: v_xor_b32_e32 v3, v1, v2
+; GCN-NEXT: v_xor_b32_e32 v4, v0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3
+; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc
+; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
+; GCN-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-NEXT: s_mov_b32 s4, 0x8000
+; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT: v_trunc_f32_e32 v1, v1
+; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v1, v7, v6
+; GCN-NEXT: v_mul_lo_u32 v9, v8, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v7, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GCN-NEXT: v_mul_lo_u32 v9, v5, v0
+; GCN-NEXT: v_mul_hi_u32 v10, v5, v1
+; GCN-NEXT: v_mul_hi_u32 v11, v5, v0
+; GCN-NEXT: v_mul_hi_u32 v12, v6, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v6, v0
; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GCN-NEXT: v_mul_lo_u32 v11, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v8, v4, v8
+; GCN-NEXT: v_mul_lo_u32 v11, v6, v1
+; GCN-NEXT: v_mul_hi_u32 v1, v6, v1
; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v9, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v10
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v0
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v6
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v5
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; GCN-NEXT: v_mul_lo_u32 v7, v5, v1
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v0
+; GCN-NEXT: v_mul_hi_u32 v10, v5, v1
+; GCN-NEXT: v_mul_lo_u32 v0, v6, v0
; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v7, v5, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v5, v3
-; GCN-NEXT: v_mul_lo_u32 v6, v6, v3
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_mul_lo_u32 v9, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v10, v3, v5
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v8, v4, v5
-; GCN-NEXT: v_mul_lo_u32 v5, v4, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v6
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v6
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v6, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 17, v3
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v3
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0x8000, v5
-; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v1, vcc
-; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v0
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc
+; GCN-NEXT: v_mul_hi_u32 v10, v6, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v6, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v8, v9, vcc
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v7, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v8
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v1, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 17, v0
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v5, v4, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GCN-NEXT: v_mul_lo_u32 v5, v4, v0
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
+; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v3, vcc
+; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v4
; GCN-NEXT: v_subbrev_u32_e64 v6, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0
+; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v1
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[4:5]
-; GCN-NEXT: v_add_i32_e64 v7, s[4:5], 2, v3
+; GCN-NEXT: v_add_i32_e64 v7, s[4:5], 2, v0
; GCN-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5]
-; GCN-NEXT: v_add_i32_e64 v9, s[4:5], 1, v3
+; GCN-NEXT: v_add_i32_e64 v9, s[4:5], 1, v0
; GCN-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v7, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, v0, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v1, v2
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_sdiv_pow2_k_num_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index d2bb971b680307..b330d5d67f60e2 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -13,24 +13,30 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; CI-LABEL: add_select_fabs_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v6
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6|
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v6
-; CI-NEXT: v_add_f32_e32 v1, v1, v7
+; CI-NEXT: v_add_f32_e32 v0, v0, v5
+; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_fabs_v2f16:
@@ -94,30 +100,36 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v8
+; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8|
+; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9|
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v8
-; CI-NEXT: v_add_f32_e32 v1, v1, v9
-; CI-NEXT: v_add_f32_e32 v2, v2, v6
-; CI-NEXT: v_add_f32_e32 v3, v3, v7
+; CI-NEXT: v_add_f32_e32 v0, v0, v7
+; CI-NEXT: v_add_f32_e32 v1, v1, v6
+; CI-NEXT: v_add_f32_e32 v2, v2, v5
+; CI-NEXT: v_add_f32_e32 v3, v8, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
@@ -189,24 +201,32 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_or_b32_e32 v7, v2, v7
+; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8|
+; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9|
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
+; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v6
-; CI-NEXT: v_add_f32_e32 v1, v1, v7
+; CI-NEXT: v_add_f32_e32 v1, v1, v5
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
@@ -275,30 +295,36 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v8
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8|
+; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9|
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v6
; CI-NEXT: v_add_f32_e32 v1, v1, v7
-; CI-NEXT: v_add_f32_e32 v2, v4, v8
-; CI-NEXT: v_add_f32_e32 v3, v5, v9
+; CI-NEXT: v_add_f32_e32 v2, v4, v5
+; CI-NEXT: v_add_f32_e32 v3, v9, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
@@ -370,24 +396,27 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
; CI-LABEL: add_select_fabs_var_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v5
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v7
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v6
-; CI-NEXT: v_add_f32_e32 v1, v1, v7
+; CI-NEXT: v_add_f32_e32 v1, v1, v5
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_var_v2f16:
@@ -449,17 +478,20 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v3, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v4
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_add_f32_e32 v1, v1, v4
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negk_v2f16:
@@ -518,14 +550,17 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; CI-LABEL: add_select_fabs_negk_negk_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v0, v0, v1
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
@@ -652,17 +687,20 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v3, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v4
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_add_f32_e32 v1, v1, v4
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negk_fabs_v2f16:
@@ -724,17 +762,20 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_mov_b32_e32 v6, 0xc4800000
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v4
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negliteralk_fabs_v2f16:
@@ -795,17 +836,20 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v4
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_add_f32_e32 v1, v1, v4
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_posk_v2f16:
@@ -866,17 +910,20 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
-; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v4
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_add_f32_e32 v1, v1, v4
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_posk_fabs_v2f16:
@@ -1946,27 +1993,30 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; CI-LABEL: add_select_negfabs_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v7
-; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v6
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v6
+; CI-NEXT: v_add_f32_e32 v0, v0, v5
; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2034,26 +2084,29 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_or_b32_e32 v4, v4, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: v_or_b32_e32 v4, 0x80008000, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v6
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
+; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6|
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v6
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v0, v0, v5
+; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negfabs_v2f16:
@@ -2118,27 +2171,30 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
; CI-LABEL: add_select_neg_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v7
-; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v6
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v6
+; CI-NEXT: v_add_f32_e32 v0, v0, v5
; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2205,26 +2261,29 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_or_b32_e32 v4, v4, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: v_xor_b32_e32 v4, 0x80008000, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v6
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
+; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6|
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; CI-NEXT: v_add_f32_e32 v0, v0, v6
-; CI-NEXT: v_add_f32_e32 v1, v1, v5
+; CI-NEXT: v_add_f32_e32 v0, v0, v5
+; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_neg_v2f16:
@@ -2288,24 +2347,27 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; CI-LABEL: add_select_neg_negfabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v7
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5|
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4|
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; CI-NEXT: v_sub_f32_e32 v0, v6, v0
-; CI-NEXT: v_sub_f32_e32 v1, v7, v1
+; CI-NEXT: v_sub_f32_e32 v1, v5, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_neg_negfabs_v2f16:
@@ -2366,24 +2428,27 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
; CI-LABEL: add_select_negfabs_neg_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v5
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3|
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v7
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; CI-NEXT: v_sub_f32_e32 v0, v6, v0
-; CI-NEXT: v_sub_f32_e32 v1, v7, v1
+; CI-NEXT: v_sub_f32_e32 v1, v5, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negfabs_neg_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 1b2b9d68fff847..b371461fccd3ca 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -43,14 +43,16 @@ define amdgpu_kernel void @sext_in_reg_i1_i32(ptr addrspace(1) %out, i32 %in) #0
;
; EG-LABEL: sext_in_reg_i1_i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_INT * T1.X, KC0[2].Z, 0.0, 1,
+; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45)
%shl = shl i32 %in, 31
%sext = ashr i32 %shl, 31
store i32 %sext, ptr addrspace(1) %out
@@ -74,7 +76,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s2, s2, s3
-; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_bfe_i32 s2, s2, 0x80000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -90,7 +92,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a,
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_add_i32 s0, s2, s3
-; GFX89-NEXT: s_sext_i32_i8 s0, s0
+; GFX89-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: v_mov_b32_e32 v0, s0
; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -104,7 +106,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a,
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_add_i32 s0, s2, s3
-; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -112,15 +114,17 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a,
;
; EG-LABEL: sext_in_reg_i8_to_i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 24
%ashr = ashr i32 %shl, 24
@@ -145,7 +149,7 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s2, s2, s3
-; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_bfe_i32 s2, s2, 0x100000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -161,7 +165,7 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a,
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_add_i32 s0, s2, s3
-; GFX89-NEXT: s_sext_i32_i16 s0, s0
+; GFX89-NEXT: s_bfe_i32 s0, s0, 0x100000
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: v_mov_b32_e32 v0, s0
; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -175,7 +179,7 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a,
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_add_i32 s0, s2, s3
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x100000
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -183,13 +187,15 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a,
;
; EG-LABEL: sext_in_reg_i16_to_i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%c = add i32 %a, %b ; add to prevent folding into extload
@@ -216,7 +222,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s2, s2, s3
-; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_bfe_i32 s2, s2, 0x80000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -232,7 +238,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_add_i32 s0, s2, s3
-; GFX89-NEXT: s_sext_i32_i8 s0, s0
+; GFX89-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: v_mov_b32_e32 v0, s0
; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -246,7 +252,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_add_i32 s0, s2, s3
-; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -254,15 +260,17 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i
;
; EG-LABEL: sext_in_reg_i8_to_v1i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
%c = add <1 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <1 x i32> %c, <i32 24>
%ashr = ashr <1 x i32> %shl, <i32 24>
@@ -280,11 +288,13 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a,
; SI-LABEL: sext_in_reg_i1_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_load_dword s1, s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
-; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], s1
+; SI-NEXT: s_lshl_b32 s1, s2, 31
+; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 63
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
@@ -301,7 +311,9 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a,
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT: s_lshl_b32 s5, s4, 31
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 63
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -309,7 +321,7 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a,
;
; EG-LABEL: sext_in_reg_i1_to_i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -320,9 +332,11 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a,
; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT: MOV * T0.Y, PV.X,
%c = shl i64 %a, %b
%shl = shl i64 %c, 63
@@ -341,11 +355,13 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a,
; SI-LABEL: sext_in_reg_i8_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_load_dword s1, s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
-; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], s1
+; SI-NEXT: s_lshl_b32 s1, s2, 24
+; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
@@ -362,7 +378,9 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a,
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX9-NEXT: s_lshl_b32 s5, s4, 24
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -370,7 +388,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a,
;
; EG-LABEL: sext_in_reg_i8_to_i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -381,11 +399,13 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a,
; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
-; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.Y, PV.W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
%c = shl i64 %a, %b
%shl = shl i64 %c, 56
%ashr = ashr i64 %shl, 56
@@ -404,11 +424,13 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a,
; SI-LABEL: sext_in_reg_i16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_load_dword s1, s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
-; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], s1
+; SI-NEXT: s_lshl_b32 s1, s2, 16
+; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 48
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
@@ -425,7 +447,9 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a,
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX9-NEXT: s_lshl_b32 s5, s4, 16
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 48
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -433,7 +457,7 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a,
;
; EG-LABEL: sext_in_reg_i16_to_i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -444,11 +468,13 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a,
; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.Y, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, T0.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 48
%ashr = ashr i64 %shl, 48
@@ -470,7 +496,9 @@ define amdgpu_kernel void @sext_in_reg_i32_to_i64(ptr addrspace(1) %out, i64 %a,
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
-; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s0
+; SI-NEXT: s_ashr_i64 s[0:1], s[2:3], 32
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
@@ -487,7 +515,9 @@ define amdgpu_kernel void @sext_in_reg_i32_to_i64(ptr addrspace(1) %out, i64 %a,
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x200000
+; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_ashr_i64 s[4:5], s[6:7], 32
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -558,8 +588,9 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr ad
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 1
-; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 31, v2
+; SI-NEXT: v_mov_b32_e32 v2, v1
+; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 63
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -571,8 +602,9 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr ad
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
-; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 63, v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -580,7 +612,7 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr ad
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 13, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -597,8 +629,11 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr ad
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -642,8 +677,9 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr ad
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 8
-; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v2
+; SI-NEXT: v_mov_b32_e32 v2, v1
+; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 56
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -655,8 +691,9 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr ad
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
-; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 56, v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -664,7 +701,7 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr ad
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -681,12 +718,15 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr ad
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.Y, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, T1.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T1.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -727,8 +767,9 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr a
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_mov_b32_e32 v2, v1
+; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 48
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -740,8 +781,9 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr a
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
-; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 48, v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -749,7 +791,7 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr a
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -766,12 +808,15 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr a
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.Y, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, T1.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T1.X, PV.W, literal.x,
-; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -806,23 +851,27 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(ptr addrspace(1) %out, ptr a
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
-; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_ashr_i64 v[2:3], v[3:4], 32
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: v_sext_in_reg_i32_to_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 32, v[1:2]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sext_in_reg_i32_to_i64:
@@ -1055,17 +1104,21 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(ptr addrspace(1) %out, <2 x
;
; EG-LABEL: sext_in_reg_v2i1_to_v2i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, 1,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 31, i32 31>
%ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -1133,21 +1186,29 @@ define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(ptr addrspace(1) %out, <4 x
;
; EG-LABEL: sext_in_reg_v4i1_to_v4i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[4].X, KC0[5].X,
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, 1,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[3].W, KC0[4].W,
-; EG-NEXT: BFE_INT T0.Z, PS, 0.0, 1,
-; EG-NEXT: ADD_INT * T1.W, KC0[3].Z, KC0[4].Z,
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, 1,
-; EG-NEXT: ADD_INT * T1.W, KC0[3].Y, KC0[4].Y,
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Z, KC0[3].Z, KC0[4].Z,
+; EG-NEXT: LSHL T1.W, PS, literal.x,
+; EG-NEXT: ASHR * T2.W, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[3].Y, KC0[4].Y,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45)
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
%ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
@@ -1173,8 +1234,8 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s2, s4, s6
; SI-NEXT: s_add_i32 s4, s5, s7
-; SI-NEXT: s_sext_i32_i8 s4, s4
-; SI-NEXT: s_sext_i32_i8 s5, s2
+; SI-NEXT: s_bfe_i32 s4, s4, 0x80000
+; SI-NEXT: s_bfe_i32 s5, s2, 0x80000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mov_b32_e32 v1, s4
@@ -1190,8 +1251,8 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s4, s6
; GFX9-NEXT: s_add_i32 s1, s5, s7
-; GFX9-NEXT: s_sext_i32_i8 s1, s1
-; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_bfe_i32 s1, s1, 0x80000
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -1199,18 +1260,21 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x
;
; EG-LABEL: sext_in_reg_v2i8_to_v2i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT T1.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 24, i32 24>
%ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
@@ -1242,10 +1306,10 @@ define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x
; SI-NEXT: s_add_i32 s4, s5, s9
; SI-NEXT: s_add_i32 s5, s6, s10
; SI-NEXT: s_add_i32 s6, s7, s11
-; SI-NEXT: s_sext_i32_i8 s6, s6
-; SI-NEXT: s_sext_i32_i8 s5, s5
-; SI-NEXT: s_sext_i32_i8 s4, s4
-; SI-NEXT: s_sext_i32_i8 s7, s2
+; SI-NEXT: s_bfe_i32 s6, s6, 0x80000
+; SI-NEXT: s_bfe_i32 s5, s5, 0x80000
+; SI-NEXT: s_bfe_i32 s4, s4, 0x80000
+; SI-NEXT: s_bfe_i32 s7, s2, 0x80000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v1, s4
@@ -1265,10 +1329,10 @@ define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x
; GFX9-NEXT: s_add_i32 s5, s5, s9
; GFX9-NEXT: s_add_i32 s6, s6, s10
; GFX9-NEXT: s_add_i32 s7, s7, s11
-; GFX9-NEXT: s_sext_i32_i8 s7, s7
-; GFX9-NEXT: s_sext_i32_i8 s6, s6
-; GFX9-NEXT: s_sext_i32_i8 s5, s5
-; GFX9-NEXT: s_sext_i32_i8 s4, s4
+; GFX9-NEXT: s_bfe_i32 s7, s7, 0x80000
+; GFX9-NEXT: s_bfe_i32 s6, s6, 0x80000
+; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000
+; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
@@ -1278,24 +1342,29 @@ define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x
;
; EG-LABEL: sext_in_reg_v4i8_to_v4i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[4].X, KC0[5].X,
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[3].W, KC0[4].W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[3].Z, KC0[4].Z,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[3].Y, KC0[4].Y,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Z, KC0[3].Z, KC0[4].Z,
+; EG-NEXT: LSHL T1.W, PS, literal.x,
+; EG-NEXT: ASHR * T2.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[3].Y, KC0[4].Y,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.y,
+; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
%ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -1321,8 +1390,8 @@ define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s2, s4, s6
; SI-NEXT: s_add_i32 s4, s5, s7
-; SI-NEXT: s_sext_i32_i16 s4, s4
-; SI-NEXT: s_sext_i32_i16 s5, s2
+; SI-NEXT: s_bfe_i32 s4, s4, 0x100000
+; SI-NEXT: s_bfe_i32 s5, s2, 0x100000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mov_b32_e32 v1, s4
@@ -1338,8 +1407,8 @@ define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s4, s6
; GFX9-NEXT: s_add_i32 s1, s5, s7
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_bfe_i32 s1, s1, 0x100000
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x100000
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -1347,16 +1416,19 @@ define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2
;
; EG-LABEL: sext_in_reg_v2i16_to_v2i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: ADD_INT T1.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
@@ -1584,8 +1656,8 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: ALU 17, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1596,18 +1668,23 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out,
; EG-NEXT: MOV * T1.X, KC0[2].W,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: ADD_INT * T0.W, T0.W, T1.W,
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, T0.Z, T1.Z,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, T0.Y, T1.Y,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, T0.X, T1.X,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Z, T0.Y, T1.Y,
+; EG-NEXT: LSHL T1.W, PS, literal.x,
+; EG-NEXT: ASHR * T2.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, T0.X, T1.X,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.y,
+; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
%loada = load <4 x i32>, ptr addrspace(1) %a, align 16
%loadb = load <4 x i32>, ptr addrspace(1) %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -1683,8 +1760,8 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: ALU 17, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -1695,17 +1772,22 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: MOV * T1.X, KC0[2].W,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: ADD_INT * T0.W, T0.W, T1.W,
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, T0.Z, T1.Z,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT: ADD_INT T0.Z, T0.Y, T1.Y,
+; EG-NEXT: LSHL T1.W, PS, literal.x,
+; EG-NEXT: ASHR * T2.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T1.W, T0.X, T1.X,
+; EG-NEXT: ASHR T2.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, T0.X, T1.X,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: ASHR T2.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%loada = load <4 x i32>, ptr addrspace(1) %a, align 16
%loadb = load <4 x i32>, ptr addrspace(1) %b, align 16
@@ -2018,7 +2100,9 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrs
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
+; SI-NEXT: s_lshl_b32 s2, s2, 15
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_lshr_b32 s4, s2, 15
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -2058,7 +2142,7 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrs
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2067,12 +2151,15 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrs
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, 1,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, PS, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.y,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 15(2.101948e-44), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -2106,7 +2193,9 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrs
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bfe_i32 s4, s2, 0x20000
+; SI-NEXT: s_lshl_b32 s2, s2, 14
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_lshr_b32 s4, s2, 14
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -2146,7 +2235,7 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrs
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2155,14 +2244,15 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrs
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL T0.W, T0.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 30(4.203895e-44), 3(4.203895e-45)
-; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
-; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.y,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 14(1.961818e-44), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -2196,7 +2286,9 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrs
; SI-NEXT: buffer_load_ushort v1, v[0:1], s[0:3], 0 addr64
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_bfe_i32 v1, v1, 0, 1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 15, v1
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 15, v1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write_b16 v0, v1
; SI-NEXT: s_endpgm
@@ -2210,7 +2302,8 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrs
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1
+; GFX9-NEXT: v_ashrrev_i16_e32 v1, 15, v1
; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: s_endpgm
;
@@ -2221,9 +2314,14 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrs
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: TEX 0 @0
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU 2, @42, KC0[CB0:0-32], KC1[]
-; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, 1,
+; EG-NEXT: ALU 7, @42, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL * T1.W, T0.X, literal.x,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT * T1.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
; EG-NEXT: LDS_SHORT_WRITE * T0.W, T1.W,
; EG-NEXT: RETURN
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2265,7 +2363,9 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, p
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, v1, v2
-; SI-NEXT: v_bfe_i32 v1, v1, 0, 1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 15, v1
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 15, v1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write_b16 v0, v1
; SI-NEXT: s_endpgm
@@ -2282,7 +2382,8 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, p
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1
-; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1
+; GFX9-NEXT: v_ashrrev_i16_e32 v1, 15, v1
; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: s_endpgm
;
@@ -2297,12 +2398,17 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, p
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W,
; EG-NEXT: TEX 0 @0
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
-; EG-NEXT: ALU 5, @45, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 10, @45, KC0[CB0:0-32], KC1[]
; EG-NEXT: AND_INT * T1.W, T1.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, T0.X, PV.W,
-; EG-NEXT: BFE_INT T1.W, PV.W, 0.0, 1,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT * T1.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
; EG-NEXT: LDS_SHORT_WRITE * T0.W, T1.W,
; EG-NEXT: RETURN
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2337,7 +2443,9 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bfe_i32 s4, s2, 0x20000
+; SI-NEXT: s_lshl_b32 s2, s2, 14
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_lshr_b32 s4, s2, 14
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -2361,7 +2469,7 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2370,14 +2478,15 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL T0.W, T0.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 30(4.203895e-44), 3(4.203895e-45)
-; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
-; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.y,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 14(1.961818e-44), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -2408,7 +2517,9 @@ define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_lshl_b32 s2, s2, 8
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_lshr_b32 s4, s2, 8
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -2432,21 +2543,24 @@ define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.y,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, PS, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -2477,7 +2591,9 @@ define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000
+; SI-NEXT: s_lshl_b32 s2, s2, 1
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_lshr_b32 s4, s2, 1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -2501,7 +2617,7 @@ define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2510,14 +2626,13 @@ define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.X, 1,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 17(2.382207e-44), 3(4.203895e-45)
-; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
-; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
+; EG-NEXT: BFE_UINT T0.W, PV.W, 1, literal.x,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
@@ -2545,10 +2660,12 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x
; SI-NEXT: s_lshr_b32 s5, s3, 16
; SI-NEXT: s_add_i32 s2, s2, s3
; SI-NEXT: s_add_i32 s4, s4, s5
-; SI-NEXT: s_bfe_i32 s2, s2, 0x10000
-; SI-NEXT: s_bfe_i32 s3, s4, 0x10000
-; SI-NEXT: s_and_b32 s2, s2, 0xffff
-; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_lshl_b32 s2, s2, 15
+; SI-NEXT: s_lshl_b32 s3, s4, 15
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_sext_i32_i16 s3, s3
+; SI-NEXT: s_bfe_u32 s2, s2, 0x10000f
+; SI-NEXT: s_lshl_b32 s3, s3, 1
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
@@ -2576,7 +2693,7 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2590,12 +2707,17 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
-; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, 1,
-; EG-NEXT: BFE_INT * T0.W, T0.W, 0.0, 1,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: 15(2.101948e-44), -65536(nan)
+; EG-NEXT: OR_INT T4.X, PS, PV.W,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
@@ -2622,16 +2744,20 @@ define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(ptr addrspace(1) %out, <3 x
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s8, s4, 16
; SI-NEXT: s_lshr_b32 s9, s6, 16
-; SI-NEXT: s_add_i32 s5, s5, s7
; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_add_i32 s5, s5, s7
; SI-NEXT: s_add_i32 s8, s8, s9
-; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
-; SI-NEXT: s_bfe_i32 s5, s5, 0x10000
-; SI-NEXT: s_bfe_i32 s6, s8, 0x10000
-; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_lshl_b32 s5, s5, 15
+; SI-NEXT: s_lshl_b32 s4, s4, 15
+; SI-NEXT: s_lshl_b32 s6, s8, 15
+; SI-NEXT: s_sext_i32_i16 s5, s5
+; SI-NEXT: s_sext_i32_i16 s4, s4
+; SI-NEXT: s_sext_i32_i16 s6, s6
+; SI-NEXT: s_bfe_u32 s4, s4, 0x10000f
+; SI-NEXT: s_lshr_b32 s5, s5, 15
+; SI-NEXT: s_lshl_b32 s6, s6, 1
; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: s_lshl_b32 s5, s6, 16
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
@@ -2645,60 +2771,66 @@ define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(ptr addrspace(1) %out, <3 x
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_pk_add_u16 v0, s5, v0
-; GFX9-NEXT: v_pk_add_u16 v1, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_pk_add_u16 v1, s5, v1
+; GFX9-NEXT: v_pk_add_u16 v0, s4, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 15, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, 15, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 15, v1
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_ashrrev_i16 v1, 15, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sext_in_reg_v3i1_to_v3i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @18, KC0[], KC1[]
; EG-NEXT: TEX 5 @6
-; EG-NEXT: ALU 25, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 31, @19, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T8.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: MEM_RAT MSKOR T6.XW, T5.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 52, #3
-; EG-NEXT: VTX_READ_16 T8.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_16 T9.X, T5.X, 54, #3
-; EG-NEXT: VTX_READ_16 T10.X, T5.X, 48, #3
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 56, #3
+; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
+; EG-NEXT: VTX_READ_16 T8.X, T5.X, 44, #3
+; EG-NEXT: VTX_READ_16 T9.X, T5.X, 52, #3
+; EG-NEXT: VTX_READ_16 T10.X, T5.X, 46, #3
+; EG-NEXT: VTX_READ_16 T5.X, T5.X, 54, #3
; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 19:
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T1.W, T10.X, T5.X,
-; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T1.W, PS, 0.0, 1,
-; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T2.W, PS, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; EG-NEXT: LSHL T5.X, PV.W, PS,
-; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: ADD_INT * T0.W, T6.X, T7.X,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 15(2.101948e-44), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.y,
+; EG-NEXT: MOV * T2.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T3.W, PV.Z, literal.y,
+; EG-NEXT: 15(2.101948e-44), 3(4.203895e-45)
+; EG-NEXT: LSHL T6.X, PV.W, PS,
+; EG-NEXT: LSHL * T6.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T5.Z, 0.0,
-; EG-NEXT: ADD_INT * T1.W, T8.X, T9.X,
-; EG-NEXT: ADD_INT * T2.W, T6.X, T7.X,
-; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, 1,
-; EG-NEXT: BFE_INT * T1.W, T1.W, 0.0, 1,
-; EG-NEXT: LSHR T6.X, T0.W, literal.x,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.y,
+; EG-NEXT: MOV T6.Y, 0.0,
+; EG-NEXT: MOV T6.Z, 0.0,
+; EG-NEXT: ADD_INT * T0.W, T10.X, T5.X,
+; EG-NEXT: ADD_INT * T3.W, T8.X, T9.X,
+; EG-NEXT: LSHL T3.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
+; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T5.X, T1.W, literal.x,
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.y, T2.W,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T7.X, PV.W, PS,
+; EG-NEXT: 2(2.802597e-45), 15(2.101948e-44)
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT T7.X, PS, PV.W,
; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
@@ -2722,10 +2854,12 @@ define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x
; SI-NEXT: s_lshr_b32 s5, s3, 16
; SI-NEXT: s_add_i32 s2, s2, s3
; SI-NEXT: s_add_i32 s4, s4, s5
-; SI-NEXT: s_bfe_i32 s2, s2, 0x20000
-; SI-NEXT: s_bfe_i32 s3, s4, 0x20000
-; SI-NEXT: s_and_b32 s2, s2, 0xffff
-; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_lshl_b32 s2, s2, 14
+; SI-NEXT: s_lshl_b32 s3, s4, 14
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_sext_i32_i16 s3, s3
+; SI-NEXT: s_bfe_u32 s2, s2, 0x10000e
+; SI-NEXT: s_lshl_b32 s3, s3, 2
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
@@ -2753,15 +2887,15 @@ define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T5.X, T4.X, 40, #3
-; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
-; EG-NEXT: VTX_READ_16 T7.X, T4.X, 42, #3
-; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 44, #3
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T4.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
@@ -2769,14 +2903,15 @@ define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x
; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
; EG-NEXT: LSHL T1.W, PV.W, literal.x,
; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
-; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, PS, literal.x,
-; EG-NEXT: ASHR * T1.W, PV.W, literal.x,
-; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, PS, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 14(1.961818e-44), 2(2.802597e-45)
+; EG-NEXT: OR_INT T4.X, PS, PV.W,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
@@ -2800,10 +2935,12 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x
; SI-NEXT: s_lshr_b32 s5, s3, 16
; SI-NEXT: s_add_i32 s2, s2, s3
; SI-NEXT: s_add_i32 s4, s4, s5
-; SI-NEXT: s_sext_i32_i8 s2, s2
-; SI-NEXT: s_sext_i32_i8 s3, s4
-; SI-NEXT: s_and_b32 s2, s2, 0xffff
-; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_lshl_b32 s2, s2, 8
+; SI-NEXT: s_lshl_b32 s3, s4, 8
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_sext_i32_i16 s3, s3
+; SI-NEXT: s_bfe_u32 s2, s2, 0x100008
+; SI-NEXT: s_lshl_b32 s3, s3, 8
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
@@ -2831,7 +2968,7 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 10, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2845,13 +2982,17 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
-; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: LSHL T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T4.X, PS, PV.W,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
@@ -2878,16 +3019,20 @@ define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(ptr addrspace(1) %out, <3 x
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s8, s4, 16
; SI-NEXT: s_lshr_b32 s9, s6, 16
-; SI-NEXT: s_add_i32 s5, s5, s7
; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_add_i32 s5, s5, s7
; SI-NEXT: s_add_i32 s8, s8, s9
-; SI-NEXT: s_sext_i32_i8 s4, s4
-; SI-NEXT: s_sext_i32_i8 s5, s5
-; SI-NEXT: s_sext_i32_i8 s6, s8
-; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: s_lshl_b32 s4, s4, 8
+; SI-NEXT: s_lshl_b32 s6, s8, 8
+; SI-NEXT: s_sext_i32_i16 s5, s5
+; SI-NEXT: s_sext_i32_i16 s4, s4
+; SI-NEXT: s_sext_i32_i16 s6, s6
+; SI-NEXT: s_bfe_u32 s4, s4, 0x100008
+; SI-NEXT: s_lshr_b32 s5, s5, 8
+; SI-NEXT: s_lshl_b32 s6, s6, 8
; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: s_lshl_b32 s5, s6, 16
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
@@ -2901,61 +3046,65 @@ define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(ptr addrspace(1) %out, <3 x
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_pk_add_u16 v0, s5, v0
-; GFX9-NEXT: v_pk_add_u16 v1, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_pk_add_u16 v1, s5, v1
+; GFX9-NEXT: v_pk_add_u16 v0, s4, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sext_in_reg_v3i8_to_v3i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @18, KC0[], KC1[]
; EG-NEXT: TEX 5 @6
-; EG-NEXT: ALU 26, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 30, @19, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T8.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: MEM_RAT MSKOR T6.XW, T5.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 52, #3
-; EG-NEXT: VTX_READ_16 T8.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_16 T9.X, T5.X, 54, #3
-; EG-NEXT: VTX_READ_16 T10.X, T5.X, 48, #3
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 56, #3
+; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
+; EG-NEXT: VTX_READ_16 T8.X, T5.X, 44, #3
+; EG-NEXT: VTX_READ_16 T9.X, T5.X, 52, #3
+; EG-NEXT: VTX_READ_16 T10.X, T5.X, 46, #3
+; EG-NEXT: VTX_READ_16 T5.X, T5.X, 54, #3
; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 19:
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T1.W, T10.X, T5.X,
-; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x,
-; EG-NEXT: AND_INT * T2.W, PV.W, literal.y,
+; EG-NEXT: ADD_INT * T0.W, T6.X, T7.X,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.y,
+; EG-NEXT: MOV * T2.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.x, PS,
+; EG-NEXT: LSHL * T3.W, PV.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
-; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T2.W, PS, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; EG-NEXT: LSHL T5.X, PV.W, PS,
-; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: LSHL T6.X, PV.W, PS,
+; EG-NEXT: LSHL * T6.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T5.Z, 0.0,
-; EG-NEXT: ADD_INT * T1.W, T8.X, T9.X,
-; EG-NEXT: ADD_INT * T2.W, T6.X, T7.X,
-; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T1.W, T1.W, 0.0, literal.x,
+; EG-NEXT: MOV T6.Y, 0.0,
+; EG-NEXT: MOV T6.Z, 0.0,
+; EG-NEXT: ADD_INT * T0.W, T10.X, T5.X,
+; EG-NEXT: ADD_INT * T3.W, T8.X, T9.X,
+; EG-NEXT: LSHL T3.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T6.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, PV.W, literal.y,
-; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T7.X, PV.W, PS,
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T5.X, T1.W, literal.x,
+; EG-NEXT: BFE_UINT T0.W, PV.W, literal.y, T2.W,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: OR_INT T7.X, PS, PV.W,
; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
index d55e201394a318..b747511c0fc7f5 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -144,9 +144,10 @@ define amdgpu_kernel void @lshr_and_i64_35(ptr addrspace(1) %out, ptr addrspace(
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 8, 23
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff00, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 40
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
index 7df9ff34f4feec..ef770c5e04c863 100644
--- a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
@@ -14,8 +14,8 @@ define amdgpu_kernel void @add_const_offset(ptr addrspace(1) nocapture %arg) {
; CHECK-LABEL: add_const_offset:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, 0xc8, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NEXT: v_add_u32_e32 v0, vcc, 0xc80, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -46,8 +46,8 @@ define amdgpu_kernel void @or_const_offset(ptr addrspace(1) nocapture %arg) {
; CHECK-LABEL: or_const_offset:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT: v_or_b32_e32 v0, 0x100, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NEXT: v_or_b32_e32 v0, 0x1000, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index c440392153adbd..db967c09dde05d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -585,27 +585,25 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; EG-LABEL: shl_v2i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @10
-; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 13, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
-; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
-; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T7.X, KC0[2].Z,
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
-; EG-NEXT: LSHR T0.W, T0.X, literal.y,
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T2.X, T0.X,
+; EG-NEXT: MOV * T0.X, PV.X,
+; EG-NEXT: AND_INT T0.Z, PV.X, literal.x,
+; EG-NEXT: LSHR T0.W, PV.X, literal.y,
; EG-NEXT: LSHR * T1.W, T7.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHL T0.W, PS, PV.W,
@@ -681,51 +679,59 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; EG-LABEL: shl_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T6.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: MOV T4.X, T10.X,
+; EG-NEXT: MOV * T5.X, T10.Y,
+; EG-NEXT: MOV T0.X, PS,
+; EG-NEXT: MOV * T2.X, T10.Z,
+; EG-NEXT: MOV T3.X, T10.W,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, T10.X, PV.W,
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T6.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR T1.W, T10.Z, literal.x,
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: MOV * T6.X, T1.W,
+; EG-NEXT: MOV T1.Y, PV.X,
+; EG-NEXT: LSHR T1.W, T0.Z, literal.x,
; EG-NEXT: LSHR * T2.W, T10.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, PS, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
+; EG-NEXT: LSHL * T1.W, PS, PV.W,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T2.W, T1.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV * T0.X, T7.X,
-; EG-NEXT: AND_INT * T1.W, T10.W, literal.x,
+; EG-NEXT: MOV T0.Z, T7.X,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, T10.Y, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV * T7.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR T1.W, T10.W, literal.x,
+; EG-NEXT: LSHR T1.W, T0.Y, literal.x,
; EG-NEXT: LSHR * T2.W, T10.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PS, PV.W,
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index b81af3eb838f1f..695307434c18d6 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -679,13 +679,9 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v3
-; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
-; CI-NEXT: v_and_b32_e32 v3, 0xff00, v3
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: v_and_b32_e32 v4, 0xff00, v4
-; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_or_b32_e32 v3, v4, v3
+; CI-NEXT: v_and_b32_e32 v3, 0xff00ff00, v3
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 2968a63b150ad2..7719b91e0eb9bf 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -377,18 +377,6 @@ define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32
define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
-; GCN-LABEL: shl_add_ptr_combine_2use_private:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v2, 9
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, 10
-; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
%idx = zext i16 %idx.arg to i32
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 2
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 1a55bf608ebf51..cbf5ba3c687a22 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1645,9 +1645,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -1773,9 +1775,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, -7, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, -7, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -1914,9 +1918,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -2056,9 +2062,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, -7, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v3, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, -7, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
;
@@ -2091,11 +2099,12 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-SDAG-NEXT: v_add_u16_e32 v3, -7, v3
; VI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2181,6 +2190,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -2303,6 +2316,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x3c000000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -2438,6 +2455,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xbc000000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -2574,9 +2595,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -2702,6 +2725,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -2824,9 +2851,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v3, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
;
@@ -2859,11 +2888,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-SDAG-NEXT: v_subrev_u16_e32 v3, 32, v3
; VI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2949,9 +2979,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, -16, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, -16, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -3077,6 +3109,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -3199,9 +3235,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, -16, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v3, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, -16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v2, v3
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
;
@@ -3234,11 +3272,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-SDAG-NEXT: v_add_u16_e32 v3, -16, v3
; VI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3323,9 +3362,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -3488,9 +3529,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -3653,9 +3696,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -3781,9 +3826,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xffff0000
-; SI-SDAG-NEXT: v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2
+; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-SDAG-NEXT: v_or_b32_e32 v2, v3, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
@@ -3909,7 +3956,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index ae0221b8b32b33..eae34bce76cd7e 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -206,7 +206,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -215,12 +215,16 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T6.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T0.W, T6.X, literal.x,
+; EG-NEXT: MOV * T3.X, T6.X,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: MOV * T2.X, T6.Y,
+; EG-NEXT: MOV T0.Z, PV.X,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T6.Y, literal.x,
-; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T6.Y, literal.y,
+; EG-NEXT: BFE_INT T1.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: ASHR T0.W, PV.W, PS,
; EG-NEXT: ASHR * T1.W, PV.Y, PV.Z,
@@ -323,43 +327,53 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; EG-LABEL: ashr_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 59, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T6.X,
; EG-NEXT: MOV * T9.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: ASHR * T0.W, PV.W, PS,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MOV T4.X, T9.X,
+; EG-NEXT: MOV * T5.X, T9.Y,
+; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: MOV * T0.Z, PS,
+; EG-NEXT: MOV T2.X, T9.Z,
+; EG-NEXT: MOV * T3.X, T9.W,
+; EG-NEXT: MOV * T0.W, T6.X,
+; EG-NEXT: MOV T1.Y, T2.X,
+; EG-NEXT: BFE_INT * T1.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: ASHR * T1.W, T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T6.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T9.X, literal.x,
+; EG-NEXT: MOV * T1.Z, T3.X,
+; EG-NEXT: MOV * T6.X, T0.W,
+; EG-NEXT: MOV T0.W, PV.X,
+; EG-NEXT: LSHR * T1.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T9.Z, literal.x,
+; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T2.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: ASHR * T1.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT: MOV T6.X, PV.W,
; EG-NEXT: MOV T0.Y, T7.X,
-; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.W, literal.y,
+; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, T1.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: ASHR T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
@@ -369,10 +383,10 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T7.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T9.Y, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T9.W, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59f..72294830b14615 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -24,9 +24,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_bfe_i32 v1, v0, 0, 4
; CHECK-NEXT: v_mul_i32_i24_e32 v1, 3, v1
-; CHECK-NEXT: v_lshrrev_b32_e32 v2, 4, v1
-; CHECK-NEXT: v_bfe_u32 v1, v1, 7, 1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 4, v1
+; CHECK-NEXT: v_bfe_u32 v2, v1, 3, 1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_and_b32_e32 v1, 15, v1
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 6, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
@@ -44,9 +44,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_bfe_i32 v1, v0, 0, 6
-; CHECK-NEXT: v_bfe_u32 v1, v1, 9, 2
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 5, v1
+; CHECK-NEXT: v_bfe_u32 v1, v1, 4, 2
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v0, v1
-; CHECK-NEXT: v_and_b32_e32 v1, 60, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_and_b32_e32 v0, 63, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ed7f27b367fdaf..b546aa0ac4edb7 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -12,8 +12,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT: s_sub_u32 s0, 0, s12
-; GCN-NEXT: s_subb_u32 s1, 0, s13
+; GCN-NEXT: s_sub_u32 s2, 0, s12
+; GCN-NEXT: s_subb_u32 s3, 0, s13
; GCN-NEXT: s_mov_b32 s4, s8
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
@@ -22,68 +22,74 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s2, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s3, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s10, v1
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s3, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s2, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v3, s10, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s10, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s11, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s11, v1
; GCN-NEXT: v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v1, s12, v1
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s12, v1
; GCN-NEXT: v_mul_lo_u32 v3, s13, v0
; GCN-NEXT: v_mul_lo_u32 v0, s12, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
@@ -218,116 +224,122 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-NEXT: v_xor_b32_e32 v3, v5, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v2, vcc
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GCN-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc
; GCN-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
; GCN-NEXT: v_rcp_f32_e32 v4, v4
; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GCN-NEXT: v_trunc_f32_e32 v5, v5
; GCN-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
-; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT: v_mul_hi_u32 v8, v6, v4
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v5
-; GCN-NEXT: v_mul_lo_u32 v10, v7, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_mul_lo_u32 v10, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v11, v4, v9
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v13, v5, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v8
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4
+; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5
+; GCN-NEXT: v_mul_hi_u32 v4, v8, v6
+; GCN-NEXT: v_mul_lo_u32 v5, v8, v7
+; GCN-NEXT: v_mul_lo_u32 v10, v9, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GCN-NEXT: v_mul_lo_u32 v5, v8, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GCN-NEXT: v_mul_lo_u32 v10, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v11, v6, v5
+; GCN-NEXT: v_mul_hi_u32 v12, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v13, v7, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v7, v4
; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT: v_mul_lo_u32 v12, v5, v9
-; GCN-NEXT: v_mul_hi_u32 v9, v5, v9
+; GCN-NEXT: v_mul_lo_u32 v12, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v5, v7, v5
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v5, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v11
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v5, vcc
+; GCN-NEXT: v_mul_lo_u32 v4, v8, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v8, v6
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v7
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v6
+; GCN-NEXT: v_mul_hi_u32 v10, v7, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v11, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v7, v4
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_mul_lo_u32 v10, v4, v7
-; GCN-NEXT: v_mul_hi_u32 v11, v4, v6
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v7
-; GCN-NEXT: v_mul_hi_u32 v9, v5, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v5, v6
-; GCN-NEXT: v_mul_hi_u32 v8, v5, v7
-; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT: v_mul_lo_u32 v7, v5, v7
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v6
-; GCN-NEXT: v_mul_lo_u32 v7, v0, v5
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v9, v0, v5
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT: v_mul_lo_u32 v9, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v10, v1, v5
-; GCN-NEXT: v_mul_lo_u32 v5, v1, v5
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v3, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v4
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v5
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc
-; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v3
-; GCN-NEXT: v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2
-; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc
+; GCN-NEXT: v_mul_hi_u32 v11, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v5, v7, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v10, vcc
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v4, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_or_b32_e32 v4, v4, v9
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GCN-NEXT: v_xor_b32_e32 v10, v0, v8
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v4
+; GCN-NEXT: v_xor_b32_e32 v9, v1, v8
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v5, vcc
+; GCN-NEXT: v_mul_lo_u32 v5, v10, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v10, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v10, v1
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v9, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v9, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v9, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v4, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v4, v3, v0
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v1
+; GCN-NEXT: v_mul_lo_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v3, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, v9, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
+; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v2, vcc
+; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v3
+; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2
+; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v3
; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v2
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v2
; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[6:7]
+; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v5, v3
+; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
-; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v3
-; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
-; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v10, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v6
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v6
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v8
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_srem:
@@ -959,71 +971,77 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v1, v2
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s0, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s0, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s1, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_lo_u32 v6, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v4, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s0, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s0, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s1, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s0, v3
; GCN-NEXT: s_add_u32 s0, s2, s6
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; GCN-NEXT: s_addc_u32 s1, s3, s6
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7]
-; GCN-NEXT: v_mul_lo_u32 v2, s14, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s14, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s14, v1
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v3, s14, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s14, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s14, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s15, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s15, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s15, v1
; GCN-NEXT: v_mul_lo_u32 v1, s15, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s15, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s15, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v1, s12, v1
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s12, v1
; GCN-NEXT: v_mul_lo_u32 v3, s13, v0
; GCN-NEXT: v_mul_lo_u32 v0, s12, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
@@ -1341,52 +1359,56 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s2, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s3, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s2, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s3, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s2, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v2, v1, 24
; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
@@ -1516,100 +1538,104 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
-; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
-; GCN-NEXT: v_rcp_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; GCN-NEXT: v_trunc_f32_e32 v3, v3
-; GCN-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_hi_u32 v6, v4, v2
-; GCN-NEXT: v_mul_lo_u32 v7, v4, v3
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT: v_mul_lo_u32 v7, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
+; GCN-NEXT: v_xor_b32_e32 v3, v1, v2
+; GCN-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
+; GCN-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT: v_trunc_f32_e32 v1, v1
+; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v1, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v8, v7, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v6, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v0
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v10, v4, v0
+; GCN-NEXT: v_mul_hi_u32 v11, v5, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v5, v0
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v10, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v7
+; GCN-NEXT: v_mul_lo_u32 v10, v5, v1
+; GCN-NEXT: v_mul_hi_u32 v1, v5, v1
; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v9
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v1, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v7, v4, v0
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v1
+; GCN-NEXT: v_mul_lo_u32 v0, v5, v0
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v2
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v2
-; GCN-NEXT: v_mul_lo_u32 v4, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_mul_hi_u32 v6, v3, v5
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v3, v5
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v2, v2, 24
-; GCN-NEXT: v_mul_hi_u32 v3, v3, 24
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v2
-; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
-; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2
-; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
-; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc
+; GCN-NEXT: v_mul_hi_u32 v9, v5, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v5, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v7, v8, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v7
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v4, v1, 24
+; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
+; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
+; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, vcc
+; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v2
; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v1
+; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v0
+; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v2
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1
-; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v3
+; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7]
-; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0
-; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v2
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_srem_k_num_i64:
@@ -1711,96 +1737,101 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
-; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
-; GCN-NEXT: v_rcp_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; GCN-NEXT: v_trunc_f32_e32 v3, v3
-; GCN-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_hi_u32 v6, v4, v2
-; GCN-NEXT: v_mul_lo_u32 v7, v4, v3
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT: v_mul_lo_u32 v7, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
+; GCN-NEXT: v_xor_b32_e32 v3, v1, v2
+; GCN-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
+; GCN-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-NEXT: s_mov_b32 s4, 0x8000
+; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT: v_trunc_f32_e32 v1, v1
+; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v1, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v8, v7, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v6, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v0
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v10, v4, v0
+; GCN-NEXT: v_mul_hi_u32 v11, v5, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v5, v0
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v10, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v7
+; GCN-NEXT: v_mul_lo_u32 v10, v5, v1
+; GCN-NEXT: v_mul_hi_u32 v1, v5, v1
; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v9
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v1, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v7, v4, v0
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v1
+; GCN-NEXT: v_mul_lo_u32 v0, v5, v0
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v2
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v2
-; GCN-NEXT: v_mul_lo_u32 v4, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_mul_hi_u32 v6, v3, v5
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v3, v5
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v2
-; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
-; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2
-; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
-; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc
+; GCN-NEXT: v_mul_hi_u32 v9, v5, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v5, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v7, v8, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v7
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v1, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 17, v0
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, vcc
+; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v2
; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v1
+; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v0
+; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v2
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1
-; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v3
+; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7]
-; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0
-; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v2
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_srem_pow2_k_num_i64:
@@ -1905,7 +1936,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff8000, v2
+; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 15
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 15
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2243,7 +2275,8 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v1
; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff8000, v2
+; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 15
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 15
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 71017f15e3c6d1..d3d1bb32a30222 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -28,8 +28,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
; GFX9-LABEL: v_ssubsat_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 8
+; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -37,6 +38,8 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
; GFX10PLUS-LABEL: v_ssubsat_i8:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
@@ -142,7 +145,7 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll
index 8e2d464bad2ddf..744939f6485f01 100644
--- a/llvm/test/CodeGen/AMDGPU/store-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-private.ll
@@ -19,8 +19,10 @@
define amdgpu_kernel void @store_i1(ptr addrspace(5) %out) {
; EG-LABEL: store_i1:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 15, @0, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 17, @0, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -40,8 +42,10 @@ define amdgpu_kernel void @store_i1(ptr addrspace(5) %out) {
;
; CM-LABEL: store_i1:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 15, @0, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 17, @0, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -91,8 +95,10 @@ entry:
define amdgpu_kernel void @store_i8(ptr addrspace(5) %out, i8 %in) {
; EG-LABEL: store_i8:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 6, @1, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -116,8 +122,10 @@ define amdgpu_kernel void @store_i8(ptr addrspace(5) %out, i8 %in) {
;
; CM-LABEL: store_i8:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 6, @1, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
@@ -168,8 +176,10 @@ entry:
define amdgpu_kernel void @store_i16(ptr addrspace(5) %out, i16 %in) {
; EG-LABEL: store_i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 4, @3, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 6, @3, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -194,8 +204,10 @@ define amdgpu_kernel void @store_i16(ptr addrspace(5) %out, i16 %in) {
;
; CM-LABEL: store_i16:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 4, @3, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 6, @3, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
@@ -250,8 +262,10 @@ entry:
define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
; EG-LABEL: store_i24:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 37, @5, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 41, @5, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -259,42 +273,46 @@ define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T2.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT T3.W, KC0[2].Z, literal.y,
; EG-NEXT: NOT_INT * T2.W, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.X, PV.W,
-; EG-NEXT: LSHL T1.W, T3.W, T1.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: AND_INT T1.Z, T0.X, PS,
+; EG-NEXT: LSHL T1.W, PV.W, T1.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, literal.x, PV.W,
-; EG-NEXT: MOV * T2.W, literal.y,
+; EG-NEXT: MOV * T3.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT: BFE_UINT T2.W, KC0[2].Z, literal.x, PS,
+; EG-NEXT: BFE_UINT T3.W, KC0[2].Z, literal.x, PS,
; EG-NEXT: NOT_INT * T1.W, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, T0.X, PS,
; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: RETURN
;
; CM-LABEL: store_i24:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 38, @5, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 41, @5, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -304,21 +322,22 @@ define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: ADD_INT T0.Y, KC0[2].Y, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[2].Z, literal.y,
; CM-NEXT: NOT_INT * T2.W, PV.W,
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Y, T0.X, PV.W,
+; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; CM-NEXT: AND_INT T1.Y, T0.X, PV.W,
; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T0.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
@@ -394,8 +413,10 @@ entry:
define amdgpu_kernel void @store_v2i8(ptr addrspace(5) %out, <2 x i32> %in) {
; EG-LABEL: store_v2i8:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 21, @7, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 23, @7, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
@@ -421,8 +442,10 @@ define amdgpu_kernel void @store_v2i8(ptr addrspace(5) %out, <2 x i32> %in) {
;
; CM-LABEL: store_v2i8:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 21, @7, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 23, @7, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
@@ -478,39 +501,55 @@ entry:
define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
; EG-LABEL: store_v2i8_unaligned:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 34, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 50, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: MOV T0.Z, T2.X, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT * T1.W, KC0[3].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, PV.Z, literal.x,
+; EG-NEXT: LSHL T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T2.W, KC0[2].Y, literal.z,
+; EG-NEXT: -65536(nan), 8(1.121039e-44)
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T2.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: LSHL T1.Z, PS, literal.x,
+; EG-NEXT: OR_INT T1.W, PV.Z, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[2].W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; EG-NEXT: OR_INT T1.W, PV.W, PS,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.Z,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: NOT_INT * T2.W, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
-; EG-NEXT: LSHL T1.W, T3.W, T1.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 1,
+; EG-NEXT: ADD_INT T0.Z, KC0[2].Y, 1,
+; EG-NEXT: NOT_INT T2.W, PS,
+; EG-NEXT: AND_INT * T3.W, PV.W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.Z, PS, T1.Z,
+; EG-NEXT: AND_INT T2.W, T0.Y, PV.W,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
; EG-NEXT: LSHR T3.W, PS, literal.x,
-; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: OR_INT * T2.W, PV.W, PV.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
-; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T2.W,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[3].X, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
-; EG-NEXT: LSHL * T0.W, T2.W, T0.W,
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 65280(9.147676e-41)
+; EG-NEXT: LSHR T1.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
@@ -518,43 +557,57 @@ define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(5) %out, <2 x i32>
;
; CM-LABEL: store_v2i8_unaligned:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 36, @8, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 50, @8, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: MOV T0.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: AND_INT * T1.W, KC0[3].X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, PV.Z, literal.x,
+; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT: -65536(nan), 8(1.121039e-44)
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: LSHL T2.Y, PV.W, literal.x,
+; CM-NEXT: OR_INT T0.Z, PV.Y, PV.Z,
+; CM-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; CM-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: LSHL * T1.W, literal.x, PV.Y,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, 1,
+; CM-NEXT: NOT_INT T1.Z, PV.W,
+; CM-NEXT: AND_INT * T1.W, PV.Z, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
-; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, 1,
-; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: LSHL T2.Y, PV.W, T2.Y,
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.Z,
+; CM-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.Y,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, KC0[3].X, literal.x,
-; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: AND_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: 255(3.573311e-43), 65280(9.147676e-41)
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: RETURN
entry:
@@ -635,8 +688,10 @@ entry:
define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
; EG-LABEL: store_v2i16_unaligned:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 35, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 41, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
@@ -644,40 +699,46 @@ define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T2.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT T3.W, KC0[2].W, literal.y,
; EG-NEXT: NOT_INT * T2.W, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
-; EG-NEXT: LSHL T1.W, T3.W, T1.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T1.W, PV.W, T1.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[3].X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T3.W, KC0[3].X, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
-; EG-NEXT: LSHL * T0.W, T2.W, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: RETURN
;
; CM-LABEL: store_v2i16_unaligned:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 41, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
@@ -687,30 +748,32 @@ define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
; CM-NEXT: NOT_INT * T2.W, PV.W,
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, KC0[3].X, literal.x,
-; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, literal.x, PV.W,
+; CM-NEXT: LSHL * T1.W, KC0[3].X, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, T0.Y, PV.W,
; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
@@ -839,9 +902,11 @@ entry:
define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
; EG-LABEL: store_v4i8_unaligned:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 81, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 92, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
@@ -851,74 +916,83 @@ define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32>
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T3.W, KC0[4].X, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: LSHL * T3.W, KC0[4].X, literal.y,
+; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T4.W, PS, literal.y,
; EG-NEXT: NOT_INT * T2.W, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
-; EG-NEXT: LSHL T0.W, T3.W, T0.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[3].W, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
-; EG-NEXT: LSHL T0.W, T2.W, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, 1,
+; EG-NEXT: LSHL T4.W, literal.x, PV.W,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: ADD_INT T0.Z, KC0[2].Y, 1,
+; EG-NEXT: LSHR T5.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T4.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: AND_INT * T4.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
; EG-NEXT: LSHR T4.W, PS, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T3.W, KC0[3].Z, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
-; EG-NEXT: LSHL * T0.W, T3.W, T0.W,
-; EG-NEXT: LSHR T5.W, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: LSHL T5.W, literal.x, PV.W,
+; EG-NEXT: LSHL * T2.W, T2.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: LSHR T6.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T5.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: AND_INT * T5.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T5.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: LSHL T0.W, T3.W, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.y,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT T0.W, PS, PV.W,
-; EG-NEXT: LSHL * T1.W, T2.W, literal.y,
-; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
-; EG-NEXT: OR_INT T1.Z, PV.W, PS,
-; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
-; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T2.W, T3.W, T2.W,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.Z, PS, T1.W,
+; EG-NEXT: AND_INT T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T2.W, literal.x, PS,
-; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: AND_INT T1.W, PS, literal.x,
; EG-NEXT: NOT_INT * T2.W, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, T0.Y, PS,
-; EG-NEXT: LSHL * T0.W, PV.W, T1.W,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
@@ -926,9 +1000,11 @@ define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32>
;
; CM-LABEL: store_v4i8_unaligned:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 84, @12, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 92, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
@@ -937,80 +1013,86 @@ define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32>
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, KC0[4].X, literal.x,
-; CM-NEXT: NOT_INT * T2.W, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, literal.x, PV.W,
+; CM-NEXT: LSHL * T2.W, KC0[4].X, literal.y,
+; CM-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.y,
+; CM-NEXT: NOT_INT * T3.W, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
-; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
-; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T2.Z, literal.x, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, 1,
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T3.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
-; CM-NEXT: LSHL T2.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, 1,
-; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
-; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T1.Z, KC0[3].Z, literal.x,
-; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
-; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
-; CM-NEXT: LSHR T4.Z, KC0[2].Y, literal.x,
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: AND_INT T0.Z, KC0[3].Z, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: LSHL * T3.W, PV.Z, literal.y,
+; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T4.W, PV.Z,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T4.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: LSHL T1.Z, T1.Z, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.y,
-; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; CM-NEXT: AND_INT T1.Y, KC0[2].Y, literal.x,
-; CM-NEXT: OR_INT T1.Z, PV.W, PV.Z,
-; CM-NEXT: LSHL * T0.W, T0.Z, literal.y,
-; CM-NEXT: 3(4.203895e-45), 16(2.242078e-44)
-; CM-NEXT: OR_INT T2.Y, PV.Z, PV.W,
-; CM-NEXT: AND_INT T0.Z, KC0[3].Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT * T0.W, T2.W, T3.W,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: OR_INT T1.Y, PV.W, T1.W,
+; CM-NEXT: AND_INT T2.Z, KC0[3].Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
-; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: LSHL T0.Z, literal.x, PV.W,
; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: AND_INT T2.Z, PV.W, literal.x,
; CM-NEXT: NOT_INT * T1.W, PV.Z,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: AND_INT T0.Z, T0.Y, PV.W,
; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T4.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: RETURN
entry:
@@ -1126,16 +1208,20 @@ entry:
define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32> %in) {
; EG-LABEL: store_v8i8_unaligned:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 106, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[5].X, literal.y,
+; EG-NEXT: ALU 103, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: AND_INT * T0.W, KC0[5].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[6].X, literal.y,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: AND_INT * T0.W, KC0[6].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.W,
@@ -1171,187 +1257,206 @@ define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32>
; EG-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x,
-; EG-NEXT: AND_INT T0.W, KC0[4].Y, literal.y,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: AND_INT T1.W, KC0[4].Y, literal.y,
+; EG-NEXT: AND_INT * T2.W, PV.W, literal.z,
; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: -4(nan), 0(0.000000e+00)
; EG-NEXT: LSHR T2.W, PS, literal.x,
-; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T3.X, PS,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV T0.Z, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T1.W, T1.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T3.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T4.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT: LSHL * T3.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T4.W, T0.Y, literal.y,
; EG-NEXT: NOT_INT * T3.W, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.Z, PV.W,
-; EG-NEXT: LSHL T1.W, T4.W, T1.W,
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T4.W, PS, literal.x,
-; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT: AND_INT T0.Z, T0.Z, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T0.Z, T2.X,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
-; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
-; EG-NEXT: MOV T1.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T1.W, T3.W, literal.x,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.W, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T2.W, T1.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T2.W, literal.x, PV.W,
-; EG-NEXT: MOV * T3.W, literal.y,
+; EG-NEXT: LSHL T4.W, literal.x, PV.W,
+; EG-NEXT: MOV * T5.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT: BFE_UINT T5.W, T0.Y, literal.x, PS,
-; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, 1,
+; EG-NEXT: BFE_UINT T6.W, T0.Y, literal.x, PS,
+; EG-NEXT: NOT_INT * T4.W, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.Z, T1.Y, PS,
-; EG-NEXT: LSHL T1.W, PV.W, T1.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 1,
-; EG-NEXT: LSHR T5.W, PS, literal.x,
-; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: AND_INT T2.Z, T0.W, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T2.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
-; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
-; EG-NEXT: MOV T1.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: ALU 110, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.W, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T3.W, T1.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T3.W, PS, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: LSHL * T4.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T4.W, T0.Y, literal.x, T3.W,
-; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: BFE_UINT T6.W, T0.Y, literal.x, T5.W,
+; EG-NEXT: NOT_INT * T4.W, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T1.Y, PS,
-; EG-NEXT: LSHL * T1.W, PV.W, T1.W,
-; EG-NEXT: ALU 102, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR T4.W, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T1.W, T2.W, T1.W,
+; EG-NEXT: AND_INT T1.Z, T0.W, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T3.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
-; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T2.W, literal.x, PV.W,
-; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.y,
; EG-NEXT: NOT_INT * T2.W, PV.W,
-; EG-NEXT: AND_INT T1.Z, T0.Y, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, T1.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: 7(9.809089e-45), 255(3.573311e-43)
+; EG-NEXT: AND_INT T2.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.W, PS, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T4.W, T0.Z, literal.y,
-; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
-; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: AND_INT T1.Z, T0.Y, PV.W,
-; EG-NEXT: LSHL T0.W, T4.W, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T3.W, T0.Z, literal.y,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 6(8.407791e-45), 24(3.363116e-44)
+; EG-NEXT: AND_INT T2.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T2.W, T0.Z, literal.x, T3.W,
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_UINT T3.W, T0.Z, literal.y, T5.W,
; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: 5(7.006492e-45), 16(2.242078e-44)
+; EG-NEXT: AND_INT T2.Z, T0.Y, PS,
; EG-NEXT: LSHL T0.W, PV.W, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.W, PS, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
-; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.W, T0.Z, literal.x, T3.W,
-; EG-NEXT: NOT_INT * T1.W, PV.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_UINT T3.W, T0.Z, literal.y, T5.W,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT: AND_INT T2.Z, T0.Y, PS,
; EG-NEXT: LSHL T0.W, PV.W, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: AND_INT * T0.W, T1.Z, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x,
-; EG-NEXT: AND_INT T1.W, KC0[5].Y, literal.y,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
+; EG-NEXT: AND_INT * T1.W, KC0[5].Y, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
+; EG-NEXT: ALU 12, @15, KC0[], KC1[]
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T2.W, literal.x, PS,
-; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: LSHL T3.W, literal.x, PV.W,
+; EG-NEXT: OR_INT * T1.W, T0.Z, T1.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, literal.x,
-; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: NOT_INT * T3.W, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: AND_INT T3.W, T0.Y, PS,
; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: RETURN
;
; CM-LABEL: store_v8i8_unaligned:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 107, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: ALU 103, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: AND_INT * T0.W, KC0[5].X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[5].X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: AND_INT * T0.W, KC0[6].X, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[6].X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
@@ -1387,12 +1492,14 @@ define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32>
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T1.Y, PV.Y, literal.x,
; CM-NEXT: AND_INT T0.Z, KC0[4].Y, literal.y,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: AND_INT * T1.W, PV.W, literal.z,
; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -1405,149 +1512,159 @@ define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32>
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T2.Z, T0.Y, literal.x,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T2.Z, T0.Y, literal.y,
; CM-NEXT: NOT_INT * T2.W, PV.W,
-; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T1.Y, T0.Z, PV.W,
+; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; CM-NEXT: AND_INT T2.Y, T0.Z, PV.W,
; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T0.Z, T2.X,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
-; CM-NEXT: MOV T1.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: MOV * T0.W, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T2.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: LSHL * T2.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
-; CM-NEXT: MOV * T2.W, literal.y,
+; CM-NEXT: MOV * T3.W, literal.y,
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, 1,
; CM-NEXT: BFE_UINT T3.Z, T0.Y, literal.x, PV.W,
-; CM-NEXT: NOT_INT * T3.W, PV.Z,
+; CM-NEXT: NOT_INT * T4.W, PV.Z,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T1.Y, T1.Y, PV.W,
-; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, 1,
+; CM-NEXT: AND_INT T2.Y, T0.W, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T2.W,
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T3.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T3.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: ALU 110, @14, KC0[CB0:0-32], KC1[]
; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
-; CM-NEXT: MOV T1.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: MOV * T0.W, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T2.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: LSHL * T2.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
+; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: BFE_UINT T1.Z, T0.Y, literal.x, T2.W,
-; CM-NEXT: NOT_INT * T3.W, PV.W,
+; CM-NEXT: BFE_UINT T1.Z, T0.Y, literal.x, T3.W,
+; CM-NEXT: NOT_INT * T4.W, PV.W,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T2.Z, T1.Y, PV.W,
-; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
-; CM-NEXT: ALU 104, @14, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT: OR_INT * T0.W, T2.Z, T0.W,
+; CM-NEXT: AND_INT T0.Y, T0.W, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T2.W,
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T2.Z, T1.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: AND_INT T1.Z, T1.W, literal.y,
; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: 7(9.809089e-45), 255(3.573311e-43)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
-; CM-NEXT: LSHL T2.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR T1.Z, T0.Z, literal.x,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T1.Z, T0.Z, literal.y,
; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: 6(8.407791e-45), 24(3.363116e-44)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.x, T2.W,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.y, T3.W,
; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: 5(7.006492e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 5(7.006492e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.x, T2.W,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.y, T3.W,
; CM-NEXT: NOT_INT * T1.W, PV.W,
-; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: 4(5.605194e-45), 8(1.121039e-44)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
-; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T0.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T1.Y, T0.Z, literal.x,
-; CM-NEXT: AND_INT T0.Z, KC0[5].Y, literal.y,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.z,
+; CM-NEXT: AND_INT * T0.Z, KC0[5].Y, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
+; CM-NEXT: ALU 12, @15, KC0[], KC1[]
+; CM-NEXT: LSHL * T0.W, T0.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
-; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T1.W, T1.Y, T0.Z,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
; CM-NEXT: NOT_INT * T1.W, PV.Z,
@@ -1593,8 +1710,10 @@ entry:
define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i32> %in) {
; EG-LABEL: store_v4i8_halfaligned:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 46, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 50, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
@@ -1608,20 +1727,22 @@ define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i3
; EG-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT T3.W, PV.W, PS,
; EG-NEXT: NOT_INT * T4.W, PV.Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, T0.Y, PS,
; EG-NEXT: LSHL T1.W, PV.W, T1.W,
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
-; EG-NEXT: AND_INT T0.Z, T3.W, literal.x,
+; EG-NEXT: AND_INT T0.Z, T1.Z, literal.x,
; EG-NEXT: AND_INT T0.W, KC0[3].W, literal.y,
; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.z,
; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
@@ -1639,14 +1760,16 @@ define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i3
; EG-NEXT: AND_INT T2.W, T0.Y, PS,
; EG-NEXT: LSHL * T0.W, PV.W, T1.W,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
; EG-NEXT: RETURN
;
; CM-LABEL: store_v4i8_halfaligned:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 46, @15, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 50, @16, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
@@ -1660,20 +1783,22 @@ define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i3
; CM-NEXT: AND_INT * T2.W, KC0[3].Y, literal.z,
; CM-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T2.Y, KC0[2].Y, literal.x,
; CM-NEXT: OR_INT T1.Z, PV.Z, PV.W,
; CM-NEXT: NOT_INT * T2.W, PV.Y,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
; CM-NEXT: LSHL T1.Z, PV.Z, T1.W,
-; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
-; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
-; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
; CM-NEXT: MOV * T0.Y, T(0 + AR.x).X+,
-; CM-NEXT: AND_INT T1.Y, T1.W, literal.x,
+; CM-NEXT: AND_INT T1.Y, T2.Y, literal.x,
; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.z,
; CM-NEXT: 3(4.203895e-45), 255(3.573311e-43)
@@ -1713,7 +1838,7 @@ entry:
define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
; EG-LABEL: store_f32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 4, @17, KC0[CB0:0-32], KC1[]
; EG-NEXT: LSHR T0.W, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.W, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -1723,7 +1848,7 @@ define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
;
; CM-LABEL: store_f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 4, @17, KC0[CB0:0-32], KC1[]
; CM-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T0.W, KC0[2].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -1752,10 +1877,12 @@ define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
; EG-LABEL: store_v4i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 33, @17, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.y,
+; EG-NEXT: ALU 37, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T3.X, PV.W,
@@ -1765,9 +1892,11 @@ define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[3].Z, literal.y,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV * T2.X, PV.W,
@@ -1791,10 +1920,12 @@ define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
;
; CM-LABEL: store_v4i16:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 33, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: ALU 37, @18, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV * T3.X, PV.W,
@@ -1804,9 +1935,11 @@ define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[3].Z, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV * T2.X, PV.W,
@@ -1853,7 +1986,7 @@ entry:
define amdgpu_kernel void @store_v2f32(ptr addrspace(5) %out, float %a, float %b) {
; EG-LABEL: store_v2f32:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 10, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 10, @19, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
@@ -1869,7 +2002,7 @@ define amdgpu_kernel void @store_v2f32(ptr addrspace(5) %out, float %a, float %b
;
; CM-LABEL: store_v2f32:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 10, @18, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 10, @19, KC0[CB0:0-32], KC1[]
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
@@ -1913,7 +2046,7 @@ entry:
define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) nounwind {
; EG-LABEL: store_v3i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 16, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 16, @20, KC0[CB0:0-32], KC1[]
; EG-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
@@ -1935,7 +2068,7 @@ define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) noun
;
; CM-LABEL: store_v3i32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 16, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 16, @20, KC0[CB0:0-32], KC1[]
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: MOV T0.Y, KC0[3].Y,
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
@@ -1986,7 +2119,7 @@ define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) noun
define amdgpu_kernel void @store_v4i32(ptr addrspace(5) %out, <4 x i32> %in) {
; EG-LABEL: store_v4i32:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 22, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
@@ -2014,7 +2147,7 @@ define amdgpu_kernel void @store_v4i32(ptr addrspace(5) %out, <4 x i32> %in) {
;
; CM-LABEL: store_v4i32:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 22, @20, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
; CM-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
; CM-NEXT: MOV T0.Z, KC0[3].Y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
@@ -2072,7 +2205,7 @@ entry:
define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
; EG-LABEL: store_v4i32_unaligned:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 22, @22, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
@@ -2100,7 +2233,7 @@ define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(5) %out, <4 x i32
;
; CM-LABEL: store_v4i32_unaligned:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 22, @22, KC0[CB0:0-32], KC1[]
; CM-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
; CM-NEXT: MOV T0.Z, KC0[3].Y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
@@ -2159,7 +2292,7 @@ entry:
define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %in) {
; EG-LABEL: store_v4f32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 34, @22, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 34, @23, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
@@ -2199,7 +2332,7 @@ define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %
;
; CM-LABEL: store_v4f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 34, @22, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 34, @23, KC0[CB0:0-32], KC1[]
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 8(1.121039e-44), 12(1.681558e-44)
@@ -2256,8 +2389,10 @@ define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %
define amdgpu_kernel void @store_i64_i8(ptr addrspace(5) %out, i64 %in) {
; EG-LABEL: store_i64_i8:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 16, @23, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 18, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -2278,8 +2413,10 @@ define amdgpu_kernel void @store_i64_i8(ptr addrspace(5) %out, i64 %in) {
;
; CM-LABEL: store_i64_i8:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 17, @23, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 19, @24, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -2319,8 +2456,10 @@ entry:
define amdgpu_kernel void @store_i64_i16(ptr addrspace(5) %out, i64 %in) {
; EG-LABEL: store_i64_i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 16, @24, KC0[CB0:0-32], KC1[]
-; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU 18, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: -4(nan), 0(0.000000e+00)
+; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -2341,8 +2480,10 @@ define amdgpu_kernel void @store_i64_i16(ptr addrspace(5) %out, i64 %in) {
;
; CM-LABEL: store_i64_i16:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 17, @24, KC0[CB0:0-32], KC1[]
-; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: ALU 19, @25, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: -4(nan), 0(0.000000e+00)
+; CM-NEXT: LSHR * T0.W, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
@@ -2389,11 +2530,11 @@ entry:
define amdgpu_kernel void @vecload2(ptr addrspace(5) nocapture %out, ptr addrspace(4) nocapture %mem) #0 {
; EG-LABEL: vecload2:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: TEX 0 @0
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU 8, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @27, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LSHR T0.W, PV.W, literal.x,
@@ -2407,11 +2548,11 @@ define amdgpu_kernel void @vecload2(ptr addrspace(5) nocapture %out, ptr addrspa
;
; CM-LABEL: vecload2:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: TEX 0 @0
; CM-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; CM-NEXT: ALU 8, @26, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 8, @27, KC0[CB0:0-32], KC1[]
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f88aaf389ca9ae..0f49e2d4da873c 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -63,7 +63,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; HAWAII-NEXT: v_mov_b32_e32 v3, s0
; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4
; HAWAII-NEXT: s_waitcnt vmcnt(0)
-; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
; HAWAII-NEXT: ds_write_b32 v1, v3
; HAWAII-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6ec213a06999b6..2101071cb47ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -513,7 +513,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index c0c56ebb166108..ba38aea743a7df 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -144,7 +144,7 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT: v_bfe_u32 v1, v2, 0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: trunc_v2i64_arg_to_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index b714fda6f1d076..ee670cd4989a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -115,6 +115,7 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index f686aad0cefc25..ff107613ccb2c1 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2475,60 +2475,66 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x346d900
-; SI-NEXT: s_add_u32 s4, 0x4237, s4
+; SI-NEXT: s_add_u32 s6, 0x4237, s4
+; SI-NEXT: s_addc_u32 s4, 0, 0
+; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: v_mov_b32_e32 v2, 0xa9000000
-; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
-; SI-NEXT: s_addc_u32 s5, 0, 0
+; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v2
+; SI-NEXT: s_mov_b32 s6, 0xfffe7960
+; SI-NEXT: v_mul_hi_u32 v3, v4, s6
; SI-NEXT: s_or_b32 s4, vcc_lo, vcc_hi
+; SI-NEXT: v_mul_lo_u32 v2, v4, s6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: s_mov_b32 s4, 0xfffe7960
-; SI-NEXT: v_mul_hi_u32 v3, v2, s4
-; SI-NEXT: v_mul_lo_u32 v4, v2, s4
-; SI-NEXT: s_addc_u32 s5, s5, 0xa7c5
-; SI-NEXT: s_mul_i32 s6, s5, 0xfffe7960
-; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3
-; SI-NEXT: v_mul_lo_u32 v5, v2, v3
-; SI-NEXT: v_mul_hi_u32 v6, v2, v4
-; SI-NEXT: v_mul_hi_u32 v7, v2, v3
-; SI-NEXT: v_mul_hi_u32 v8, s5, v3
-; SI-NEXT: v_mul_lo_u32 v3, s5, v3
+; SI-NEXT: s_addc_u32 s4, s5, 0xa7c5
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
+; SI-NEXT: s_mul_i32 s5, s4, 0xfffe7960
+; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3
+; SI-NEXT: v_mul_lo_u32 v6, v4, v3
+; SI-NEXT: v_mul_hi_u32 v7, v4, v2
+; SI-NEXT: v_mul_hi_u32 v8, v4, v3
+; SI-NEXT: v_mul_hi_u32 v5, s4, v2
+; SI-NEXT: v_mul_lo_u32 v2, s4, v2
+; SI-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; SI-NEXT: v_mul_hi_u32 v8, s4, v3
+; SI-NEXT: v_mul_lo_u32 v3, s4, v3
+; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v5, vcc
+; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; SI-NEXT: v_add_i32_e32 v6, vcc, v2, v3
+; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; SI-NEXT: v_mov_b32_e32 v5, s4
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; SI-NEXT: v_mul_lo_u32 v5, v0, v3
+; SI-NEXT: v_mul_hi_u32 v6, v0, v2
+; SI-NEXT: v_mul_hi_u32 v7, v0, v3
+; SI-NEXT: v_mul_hi_u32 v4, v1, v2
+; SI-NEXT: v_mul_lo_u32 v2, v1, v2
; SI-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; SI-NEXT: v_mul_lo_u32 v7, s5, v4
-; SI-NEXT: v_mul_hi_u32 v4, s5, v4
-; SI-NEXT: s_mov_b32 s4, 0x186a0
-; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
-; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
-; SI-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v5, s5
-; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; SI-NEXT: v_mul_lo_u32 v4, v0, v3
-; SI-NEXT: v_mul_hi_u32 v5, v0, v2
-; SI-NEXT: v_mul_hi_u32 v6, v0, v3
; SI-NEXT: v_mul_hi_u32 v7, v1, v3
; SI-NEXT: v_mul_lo_u32 v3, v1, v3
-; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; SI-NEXT: v_mul_lo_u32 v6, v1, v2
-; SI-NEXT: v_mul_hi_u32 v2, v1, v2
-; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; SI-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; SI-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
-; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; SI-NEXT: v_mul_lo_u32 v4, v3, s4
-; SI-NEXT: v_mul_hi_u32 v5, v2, s4
+; SI-NEXT: v_add_i32_e32 v5, vcc, v2, v3
+; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; SI-NEXT: s_mov_b32 s4, 0x186a0
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
+; SI-NEXT: v_mul_hi_u32 v4, v2, s4
+; SI-NEXT: v_mul_lo_u32 v5, v3, s4
; SI-NEXT: v_mul_lo_u32 v6, v2, s4
-; SI-NEXT: s_mov_b32 s4, 0x1869f
-; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; SI-NEXT: v_subrev_i32_e32 v4, vcc, 0x186a0, v0
+; SI-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0
; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
+; SI-NEXT: s_mov_b32 s4, 0x1869f
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4
; SI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2553,172 +2559,188 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x346d900
-; VI-NEXT: s_add_u32 s4, 0x4237, s4
+; VI-NEXT: s_add_u32 s6, 0x4237, s4
+; VI-NEXT: s_addc_u32 s4, 0, 0
+; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: v_mov_b32_e32 v2, 0xa9000000
-; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v2
+; VI-NEXT: v_add_u32_e32 v7, vcc, s4, v2
; VI-NEXT: s_mov_b32 s4, 0xfffe7960
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, 0
-; VI-NEXT: s_addc_u32 s6, 0, 0
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, s4, 0
; VI-NEXT: s_cmp_lg_u64 vcc, 0
-; VI-NEXT: s_addc_u32 s6, s6, 0xa7c5
+; VI-NEXT: s_addc_u32 s6, s5, 0xa7c5
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
; VI-NEXT: s_mul_i32 s4, s6, 0xfffe7960
-; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v3
-; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
-; VI-NEXT: v_mul_hi_u32 v7, v6, v2
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
-; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v5, 0
-; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; VI-NEXT: v_addc_u32_e32 v2, vcc, v8, v3, vcc
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; VI-NEXT: v_add_u32_e32 v8, vcc, s4, v3
+; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, 0
+; VI-NEXT: v_mul_hi_u32 v9, v7, v2
+; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v2, 0
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v8, 0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v9, v5
+; VI-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; VI-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; VI-NEXT: v_or_b32_e32 v2, v2, v4
; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_add_u32_e32 v5, vcc, v6, v2
-; VI-NEXT: v_addc_u32_e32 v4, vcc, v4, v3, vcc
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v4, 0
-; VI-NEXT: v_mul_hi_u32 v6, v0, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v5, 0
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v4, 0
-; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; VI-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; VI-NEXT: s_mov_b32 s4, 0x186a0
-; VI-NEXT: v_mul_lo_u32 v6, v5, s4
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
+; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v2
+; VI-NEXT: v_addc_u32_e32 v6, vcc, v4, v3, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
+; VI-NEXT: v_mul_hi_u32 v7, v0, v5
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
+; VI-NEXT: s_mov_b32 s6, 0x186a0
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v6, 0
+; VI-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; VI-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; VI-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; VI-NEXT: v_or_b32_e32 v2, v2, v4
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, s6, 0
+; VI-NEXT: v_mul_lo_u32 v6, v3, s6
; VI-NEXT: s_mov_b32 s4, 0x1869f
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v6
-; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0
-; VI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v4
-; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6
+; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; VI-NEXT: v_subrev_u32_e32 v4, vcc, s6, v0
+; VI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4
+; VI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, 2, v2
+; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v2
; VI-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
-; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
+; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
-; VI-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; VI-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
-; VI-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5]
-; VI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GCN-LABEL: v_test_udiv64_mulhi_fold:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0x346d900
-; GCN-NEXT: s_add_u32 s4, 0x4237, s4
+; GCN-NEXT: s_add_u32 s6, 0x4237, s4
+; GCN-NEXT: s_addc_u32 s4, 0, 0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GCN-NEXT: s_or_b32 s4, s4, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0xa9000000
-; GCN-NEXT: v_add_u32_e32 v6, vcc, s4, v2
+; GCN-NEXT: v_add_u32_e32 v7, vcc, s4, v2
; GCN-NEXT: s_mov_b32 s4, 0xfffe7960
-; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, 0
-; GCN-NEXT: s_addc_u32 s6, 0, 0
+; GCN-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, s4, 0
; GCN-NEXT: s_cmp_lg_u64 vcc, 0
-; GCN-NEXT: s_addc_u32 s6, s6, 0xa7c5
+; GCN-NEXT: s_addc_u32 s6, s5, 0xa7c5
+; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
; GCN-NEXT: s_mul_i32 s4, s6, 0xfffe7960
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; GCN-NEXT: v_add_u32_e32 v5, vcc, s4, v3
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
-; GCN-NEXT: v_mul_hi_u32 v7, v6, v2
-; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v5, 0
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v8, v3, vcc
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GCN-NEXT: v_add_u32_e32 v8, vcc, s4, v3
+; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, 0
+; GCN-NEXT: v_mul_hi_u32 v9, v7, v2
+; GCN-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v2, 0
+; GCN-NEXT: v_add_u32_e32 v9, vcc, v9, v3
+; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v8, 0
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT: v_add_u32_e32 v5, vcc, v9, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_add_u32_e32 v5, vcc, v6, v2
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v3, vcc
-; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v4, 0
-; GCN-NEXT: v_mul_hi_u32 v6, v0, v5
-; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v5, 0
-; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v4, 0
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_u32_e32 v4, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GCN-NEXT: s_mov_b32 s4, 0x186a0
-; GCN-NEXT: v_mul_lo_u32 v6, v5, s4
-; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
+; GCN-NEXT: v_add_u32_e32 v5, vcc, v7, v2
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v4, v3, vcc
+; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
+; GCN-NEXT: v_mul_hi_u32 v7, v0, v5
+; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
+; GCN-NEXT: s_mov_b32 s6, 0x186a0
+; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v6, 0
+; GCN-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GCN-NEXT: v_or_b32_e32 v2, v2, v4
+; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, s6, 0
+; GCN-NEXT: v_mul_lo_u32 v6, v3, s6
; GCN-NEXT: s_mov_b32 s4, 0x1869f
-; GCN-NEXT: v_add_u32_e32 v3, vcc, v3, v6
-; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0
-; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GCN-NEXT: v_add_u32_e32 v3, vcc, 2, v4
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_add_u32_e32 v5, vcc, v5, v6
+; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN-NEXT: v_subrev_u32_e32 v4, vcc, s6, v0
+; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN-NEXT: v_add_u32_e32 v5, vcc, 2, v2
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_u32_e32 v7, vcc, 1, v2
; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_mov_b32 s4, 0x346d900
-; GFX1030-NEXT: s_add_u32 s4, 0x4237, s4
-; GFX1030-NEXT: s_addc_u32 s5, 0, 0
+; GFX1030-NEXT: s_add_u32 s6, 0x4237, s4
+; GFX1030-NEXT: s_addc_u32 s4, 0, 0
+; GFX1030-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GFX1030-NEXT: s_or_b32 s4, s4, s6
; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4
; GFX1030-NEXT: s_cmp_lg_u32 s4, 0
-; GFX1030-NEXT: s_addc_u32 s5, s5, 0xa7c5
+; GFX1030-NEXT: s_addc_u32 s6, s5, 0xa7c5
; GFX1030-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1030-NEXT: s_mul_i32 s6, s5, 0xfffe7960
-; GFX1030-NEXT: s_mul_hi_u32 s7, s4, 0xfffe7960
-; GFX1030-NEXT: s_mul_i32 s8, s4, 0xfffe7960
-; GFX1030-NEXT: s_sub_i32 s7, s7, s4
-; GFX1030-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX1030-NEXT: s_add_i32 s7, s7, s6
-; GFX1030-NEXT: s_mul_hi_u32 s10, s5, s8
-; GFX1030-NEXT: s_mul_i32 s6, s5, s8
-; GFX1030-NEXT: s_mul_hi_u32 s8, s4, s7
-; GFX1030-NEXT: s_mul_i32 s4, s4, s7
-; GFX1030-NEXT: s_mul_hi_u32 s11, s5, s7
-; GFX1030-NEXT: s_add_u32 s4, s9, s4
-; GFX1030-NEXT: s_addc_u32 s8, 0, s8
-; GFX1030-NEXT: s_add_u32 s4, s4, s6
-; GFX1030-NEXT: s_mul_i32 s7, s5, s7
-; GFX1030-NEXT: s_addc_u32 s4, s8, s10
-; GFX1030-NEXT: s_addc_u32 s6, s11, 0
-; GFX1030-NEXT: s_add_u32 s4, s4, s7
-; GFX1030-NEXT: s_addc_u32 s6, 0, s6
+; GFX1030-NEXT: s_mul_i32 s5, s6, 0xfffe7960
+; GFX1030-NEXT: s_mul_hi_u32 s8, s4, 0xfffe7960
+; GFX1030-NEXT: s_mul_i32 s7, s4, 0xfffe7960
+; GFX1030-NEXT: s_sub_i32 s8, s8, s4
+; GFX1030-NEXT: s_mul_hi_u32 s9, s6, s7
+; GFX1030-NEXT: s_add_i32 s8, s8, s5
+; GFX1030-NEXT: s_mul_i32 s10, s6, s7
+; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s7
+; GFX1030-NEXT: s_mul_hi_u32 s7, s4, s8
+; GFX1030-NEXT: s_mul_i32 s4, s4, s8
+; GFX1030-NEXT: s_mul_hi_u32 s11, s6, s8
+; GFX1030-NEXT: s_add_u32 s4, s5, s4
+; GFX1030-NEXT: s_addc_u32 s5, 0, s7
+; GFX1030-NEXT: s_add_u32 s4, s4, s10
+; GFX1030-NEXT: s_mul_i32 s8, s6, s8
+; GFX1030-NEXT: s_addc_u32 s4, s5, s9
+; GFX1030-NEXT: s_addc_u32 s5, s11, 0
+; GFX1030-NEXT: s_add_u32 s7, s4, s8
+; GFX1030-NEXT: s_addc_u32 s4, 0, s5
+; GFX1030-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GFX1030-NEXT: s_or_b32 s4, s4, s7
; GFX1030-NEXT: v_add_co_u32 v4, s4, v2, s4
; GFX1030-NEXT: s_cmp_lg_u32 s4, 0
-; GFX1030-NEXT: s_addc_u32 s4, s5, s6
+; GFX1030-NEXT: s_addc_u32 s4, s6, s5
; GFX1030-NEXT: v_mul_hi_u32 v8, v0, v4
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, s4, 0
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, 0
@@ -2728,32 +2750,35 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX1030-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
-; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v2, v6
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
-; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x186a0, v5, 0
-; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x186a0, v6, v[3:4]
-; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
+; GFX1030-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_lshlrev_b64 v[2:3], 32, v[2:3]
+; GFX1030-NEXT: v_or_b32_e32 v7, v2, v4
+; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x186a0, v7, 0
+; GFX1030-NEXT: v_mov_b32_e32 v2, v5
+; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
+; GFX1030-NEXT: v_mad_u64_u32 v[5:6], null, 0x186a0, v3, v[2:3]
+; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX1030-NEXT: v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0
-; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2
; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v5, 2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
+; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v7, 2
+; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0
; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4
; GFX1030-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
-; GFX1030-NEXT: v_add_co_u32 v3, vcc_lo, v5, 1
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
+; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v7, 1
+; GFX1030-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc_lo
+; GFX1030-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX1030-NEXT: v_cndmask_b32_e32 v2, v8, v6, vcc_lo
; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX1030-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc_lo
+; GFX1030-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_test_udiv64_mulhi_fold:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 48b9c72ea68922..f2992318ed8380 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -20,73 +20,79 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v3
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v3, s2, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s2, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s3, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s3, v1
; GCN-NEXT: v_mul_lo_u32 v1, s3, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s8, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s8, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s9, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
; GCN-NEXT: v_mov_b32_e32 v5, s9
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v0
+; GCN-NEXT: v_mul_lo_u32 v3, s8, v1
+; GCN-NEXT: v_mul_lo_u32 v4, s9, v0
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_mul_lo_u32 v3, s8, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
@@ -203,78 +209,84 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GCN-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc
; GCN-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
; GCN-NEXT: v_rcp_f32_e32 v4, v4
; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GCN-NEXT: v_trunc_f32_e32 v5, v5
; GCN-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
-; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v8, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v8, v7
+; GCN-NEXT: v_mul_lo_u32 v10, v9, v7
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v8, v7
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GCN-NEXT: v_mul_lo_u32 v10, v7, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_mul_lo_u32 v10, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v11, v4, v9
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v13, v5, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v8
+; GCN-NEXT: v_mul_hi_u32 v11, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v12, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v13, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v6, v4
; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT: v_mul_lo_u32 v12, v5, v9
-; GCN-NEXT: v_mul_hi_u32 v9, v5, v9
+; GCN-NEXT: v_mul_lo_u32 v12, v6, v5
+; GCN-NEXT: v_mul_hi_u32 v5, v6, v5
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v5, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v11
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v5, vcc
+; GCN-NEXT: v_mul_lo_u32 v4, v8, v7
+; GCN-NEXT: v_mul_hi_u32 v5, v8, v7
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v6
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v7
+; GCN-NEXT: v_mul_hi_u32 v10, v6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GCN-NEXT: v_mul_lo_u32 v8, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v9, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v11, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v6, v4
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc
-; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v6, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc
+; GCN-NEXT: v_mul_hi_u32 v11, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v5, v6, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v10, vcc
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v4, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v9
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GCN-NEXT: v_mul_lo_u32 v7, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v8, v0, v4
+; GCN-NEXT: v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_mul_lo_u32 v10, v4, v7
-; GCN-NEXT: v_mul_hi_u32 v11, v4, v6
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v7
-; GCN-NEXT: v_mul_hi_u32 v9, v5, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v5, v6
-; GCN-NEXT: v_mul_hi_u32 v8, v5, v7
-; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT: v_mul_lo_u32 v7, v5, v7
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v5
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
; GCN-NEXT: v_mul_hi_u32 v9, v1, v5
; GCN-NEXT: v_mul_lo_u32 v5, v1, v5
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v8, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v2, v4
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v4, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v7
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v7, v2, v5
; GCN-NEXT: v_mul_lo_u32 v8, v3, v4
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GCN-NEXT: v_mul_lo_u32 v7, v2, v4
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, v1, v6
@@ -717,113 +729,118 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-LABEL: s_test_udiv24_i48:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s0, s2, 0xff000000
-; GCN-NEXT: s_and_b32 s1, s3, 0xffff
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24
+; GCN-NEXT: s_and_b32 s4, s2, 0xff000000
+; GCN-NEXT: s_and_b32 s5, s3, 0xffff
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0
-; GCN-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NEXT: s_and_b32 s6, s6, 0xff000000
-; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 24
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: v_mac_f32_e32 v1, 0, v2
; GCN-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-NEXT: s_sub_u32 s8, 0, s0
-; GCN-NEXT: s_subb_u32 s9, 0, s1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s6, s3, 0xffff
+; GCN-NEXT: s_and_b32 s7, s2, 0xff000000
+; GCN-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v1
+; GCN-NEXT: s_sub_u32 s2, 0, s2
+; GCN-NEXT: s_subb_u32 s4, 0, s3
+; GCN-NEXT: v_mul_lo_u32 v1, s2, v3
+; GCN-NEXT: v_mul_hi_u32 v2, s2, v4
+; GCN-NEXT: v_mul_lo_u32 v6, s4, v4
+; GCN-NEXT: v_mul_lo_u32 v5, s2, v4
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_hi_u32 v2, v4, v5
+; GCN-NEXT: v_mul_lo_u32 v6, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v7, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v8, v3, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_lo_u32 v7, v3, v5
+; GCN-NEXT: v_mul_hi_u32 v5, v3, v5
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v2, v1
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN-NEXT: v_lshl_b64 v[1:2], v[1:2], 32
+; GCN-NEXT: v_or_b32_e32 v1, v1, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v1
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v2, vcc
+; GCN-NEXT: v_mul_hi_u32 v2, s2, v4
+; GCN-NEXT: v_mul_lo_u32 v5, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v7, s4, v4
+; GCN-NEXT: v_mul_lo_u32 v1, s2, v4
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
-; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
-; GCN-NEXT: v_mul_lo_u32 v6, s8, v1
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v6
-; GCN-NEXT: v_mul_hi_u32 v7, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v7, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GCN-NEXT: v_mul_lo_u32 v5, v4, v2
+; GCN-NEXT: v_mul_hi_u32 v7, v4, v1
+; GCN-NEXT: v_mul_hi_u32 v8, v4, v2
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v3, v1
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT: v_mul_hi_u32 v8, v3, v2
+; GCN-NEXT: v_mul_lo_u32 v2, v3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
-; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_mul_lo_u32 v4, s8, v1
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_mul_lo_u32 v7, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v9, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT: v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v1, v2
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN-NEXT: v_lshl_b64 v[1:2], v[1:2], 32
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_or_b32_e32 v1, v1, v6
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24
; GCN-NEXT: v_mul_lo_u32 v4, v3, v2
; GCN-NEXT: v_mul_hi_u32 v1, v3, v1
; GCN-NEXT: v_mul_hi_u32 v2, v3, v2
-; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v1
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1
-; GCN-NEXT: v_mul_lo_u32 v10, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v1
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[1:2], v[1:2], 32
+; GCN-NEXT: v_or_b32_e32 v1, v1, v4
+; GCN-NEXT: v_mul_hi_u32 v4, v0, v1
+; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT: v_mul_lo_u32 v6, v0, v1
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v0
+; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0
+; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc
; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v1
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10
-; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0
-; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0
-; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
-; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -934,58 +951,62 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s5, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s4, v3
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v2, v1, 24
; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
-; GCN-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v1, s3, v0
@@ -1104,67 +1125,72 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
-; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v1, vcc
; GCN-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; GCN-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-NEXT: s_mov_b32 s4, 0x8000
; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GCN-NEXT: v_trunc_f32_e32 v3, v3
; GCN-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v2
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2
+; GCN-NEXT: v_mul_lo_u32 v2, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v3, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v8, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GCN-NEXT: v_mul_hi_u32 v3, v5, v9
; GCN-NEXT: v_mul_lo_u32 v8, v5, v2
-; GCN-NEXT: v_mul_lo_u32 v9, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_mul_hi_u32 v7, v2, v9
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GCN-NEXT: v_mul_hi_u32 v10, v5, v2
+; GCN-NEXT: v_mul_hi_u32 v11, v4, v2
+; GCN-NEXT: v_mul_lo_u32 v2, v4, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v10, v3, v9
-; GCN-NEXT: v_mul_hi_u32 v9, v3, v9
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc
+; GCN-NEXT: v_mul_lo_u32 v10, v4, v9
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v9
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v9, vcc
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v3, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; GCN-NEXT: v_or_b32_e32 v2, v2, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v2
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v3, vcc
+; GCN-NEXT: v_mul_lo_u32 v2, v6, v5
+; GCN-NEXT: v_mul_hi_u32 v3, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v8, v4, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v5, v3
+; GCN-NEXT: v_mul_hi_u32 v7, v5, v2
+; GCN-NEXT: v_mul_hi_u32 v9, v5, v3
+; GCN-NEXT: v_mul_lo_u32 v2, v4, v2
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v2
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v2
-; GCN-NEXT: v_mul_lo_u32 v4, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_mul_hi_u32 v6, v3, v5
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v3, v5
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v3
+; GCN-NEXT: v_mul_lo_u32 v3, v4, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v8, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v2, v3
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; GCN-NEXT: v_or_b32_e32 v2, v2, v7
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v4, v3, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
; GCN-NEXT: v_mul_lo_u32 v3, v1, v2
; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0x8000, v4
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
; GCN-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v1, vcc
; GCN-NEXT: v_sub_i32_e64 v6, s[4:5], v4, v0
; GCN-NEXT: v_subbrev_u32_e64 v5, s[4:5], 0, v5, s[4:5]
@@ -1364,80 +1390,86 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) {
; GCN-LABEL: s_test_udiv_k_den_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_add_u32 s1, 0, 0xaaaa0000
+; GCN-NEXT: s_add_u32 s4, 0, 0xaaaa0000
+; GCN-NEXT: s_addc_u32 s2, 0, 42
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 32
+; GCN-NEXT: s_or_b32 s8, s2, s4
; GCN-NEXT: v_mov_b32_e32 v0, 0xffffffe8
-; GCN-NEXT: v_mul_hi_u32 v0, s1, v0
-; GCN-NEXT: s_addc_u32 s8, 0, 42
-; GCN-NEXT: s_add_i32 s8, s8, 0xaaaaa80
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_mul_i32 s4, s1, 0xffffffe8
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0
-; GCN-NEXT: s_mul_i32 s9, s8, 0xffffffe8
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v2, s8, v1
-; GCN-NEXT: v_mul_lo_u32 v3, s1, v0
-; GCN-NEXT: v_mul_hi_u32 v1, s1, v1
-; GCN-NEXT: v_mul_hi_u32 v4, s1, v0
-; GCN-NEXT: s_mul_i32 s4, s8, s4
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_add_i32 s0, s3, 0xaaaaa80
+; GCN-NEXT: s_mul_i32 s1, s0, 0xffffffe8
+; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0
+; GCN-NEXT: s_mul_i32 s1, s8, 0xffffffe8
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mul_lo_u32 v1, s8, v0
+; GCN-NEXT: v_mul_hi_u32 v3, s8, v2
; GCN-NEXT: v_mul_hi_u32 v4, s8, v0
-; GCN-NEXT: v_mul_lo_u32 v0, s8, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, s4, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v2
+; GCN-NEXT: s_mul_i32 s1, s0, s1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT: v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s0, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s1, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s8, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s6, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s6, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v3, s6, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s6, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s7, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s7, v1
; GCN-NEXT: v_mul_lo_u32 v1, s7, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s7, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v2, v0, 24
+; GCN-NEXT: v_mul_lo_u32 v3, v1, 24
+; GCN-NEXT: v_mul_lo_u32 v4, v0, 24
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc
+; GCN-NEXT: v_subrev_i32_e32 v3, vcc, 24, v4
+; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v3
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v0
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v4
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v1, 24
-; GCN-NEXT: v_mul_hi_u32 v5, v0, 24
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: v_mul_lo_u32 v8, v0, 24
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, s6, v8
-; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 24, v8
-; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v5
-; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v8
-; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v5, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
@@ -1512,51 +1544,57 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-LABEL: v_test_udiv_k_den_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s4, 0, 0xaaaa0000
+; GCN-NEXT: s_add_u32 s6, 0, 0xaaaa0000
+; GCN-NEXT: s_addc_u32 s4, 0, 42
+; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 32
+; GCN-NEXT: s_or_b32 s4, s4, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0xffffffe8
; GCN-NEXT: v_mul_hi_u32 v2, s4, v2
-; GCN-NEXT: s_addc_u32 s5, 0, 42
; GCN-NEXT: s_add_i32 s5, s5, 0xaaaaa80
-; GCN-NEXT: s_mul_i32 s6, s4, 0xffffffe8
+; GCN-NEXT: s_mul_i32 s6, s5, 0xffffffe8
; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2
-; GCN-NEXT: s_mul_i32 s7, s5, 0xffffffe8
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2
-; GCN-NEXT: v_mul_hi_u32 v4, s5, v3
-; GCN-NEXT: v_mul_lo_u32 v5, s4, v2
-; GCN-NEXT: v_mul_hi_u32 v3, s4, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: s_mul_i32 s6, s4, 0xffffffe8
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mul_lo_u32 v3, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v5, s4, v4
; GCN-NEXT: v_mul_hi_u32 v6, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v4, s5, v4
; GCN-NEXT: s_mul_i32 s6, s5, s6
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
; GCN-NEXT: v_mul_hi_u32 v6, s5, v2
; GCN-NEXT: v_mul_lo_u32 v2, s5, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
; GCN-NEXT: v_mov_b32_e32 v4, s5
+; GCN-NEXT: v_or_b32_e32 v2, v2, v5
; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v0, v3
+; GCN-NEXT: v_mul_lo_u32 v5, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
+; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
; GCN-NEXT: v_mul_hi_u32 v7, v1, v3
; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v1, v2
-; GCN-NEXT: v_mul_hi_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v5, v2, 24
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v3
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; GCN-NEXT: v_or_b32_e32 v2, v2, v5
+; GCN-NEXT: v_mul_hi_u32 v4, v2, 24
+; GCN-NEXT: v_mul_lo_u32 v5, v3, 24
; GCN-NEXT: v_mul_lo_u32 v6, v2, 24
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GCN-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index a5e1506114f2d0..274fb517868d15 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -257,41 +257,42 @@ define hidden void @blam() {
; GCN-NEXT: s_mov_b32 s16, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: v_writelane_b32 v45, s16, 26
+; GCN-NEXT: v_writelane_b32 v46, s16, 26
; GCN-NEXT: s_addk_i32 s32, 0x800
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v45, s30, 0
-; GCN-NEXT: v_writelane_b32 v45, s31, 1
-; GCN-NEXT: v_writelane_b32 v45, s34, 2
-; GCN-NEXT: v_writelane_b32 v45, s35, 3
-; GCN-NEXT: v_writelane_b32 v45, s36, 4
-; GCN-NEXT: v_writelane_b32 v45, s37, 5
-; GCN-NEXT: v_writelane_b32 v45, s38, 6
-; GCN-NEXT: v_writelane_b32 v45, s39, 7
-; GCN-NEXT: v_writelane_b32 v45, s40, 8
-; GCN-NEXT: v_writelane_b32 v45, s41, 9
-; GCN-NEXT: v_writelane_b32 v45, s42, 10
-; GCN-NEXT: v_writelane_b32 v45, s43, 11
-; GCN-NEXT: v_writelane_b32 v45, s44, 12
-; GCN-NEXT: v_writelane_b32 v45, s45, 13
-; GCN-NEXT: v_writelane_b32 v45, s46, 14
-; GCN-NEXT: v_writelane_b32 v45, s47, 15
-; GCN-NEXT: v_writelane_b32 v45, s48, 16
-; GCN-NEXT: v_writelane_b32 v45, s49, 17
-; GCN-NEXT: v_writelane_b32 v45, s50, 18
-; GCN-NEXT: v_writelane_b32 v45, s51, 19
-; GCN-NEXT: v_writelane_b32 v45, s52, 20
-; GCN-NEXT: v_writelane_b32 v45, s53, 21
-; GCN-NEXT: v_writelane_b32 v45, s54, 22
-; GCN-NEXT: v_writelane_b32 v45, s55, 23
-; GCN-NEXT: v_writelane_b32 v45, s56, 24
-; GCN-NEXT: v_writelane_b32 v45, s57, 25
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_writelane_b32 v46, s30, 0
+; GCN-NEXT: v_writelane_b32 v46, s31, 1
+; GCN-NEXT: v_writelane_b32 v46, s34, 2
+; GCN-NEXT: v_writelane_b32 v46, s35, 3
+; GCN-NEXT: v_writelane_b32 v46, s36, 4
+; GCN-NEXT: v_writelane_b32 v46, s37, 5
+; GCN-NEXT: v_writelane_b32 v46, s38, 6
+; GCN-NEXT: v_writelane_b32 v46, s39, 7
+; GCN-NEXT: v_writelane_b32 v46, s40, 8
+; GCN-NEXT: v_writelane_b32 v46, s41, 9
+; GCN-NEXT: v_writelane_b32 v46, s42, 10
+; GCN-NEXT: v_writelane_b32 v46, s43, 11
+; GCN-NEXT: v_writelane_b32 v46, s44, 12
+; GCN-NEXT: v_writelane_b32 v46, s45, 13
+; GCN-NEXT: v_writelane_b32 v46, s46, 14
+; GCN-NEXT: v_writelane_b32 v46, s47, 15
+; GCN-NEXT: v_writelane_b32 v46, s48, 16
+; GCN-NEXT: v_writelane_b32 v46, s49, 17
+; GCN-NEXT: v_writelane_b32 v46, s50, 18
+; GCN-NEXT: v_writelane_b32 v46, s51, 19
+; GCN-NEXT: v_writelane_b32 v46, s52, 20
+; GCN-NEXT: v_writelane_b32 v46, s53, 21
+; GCN-NEXT: v_writelane_b32 v46, s54, 22
+; GCN-NEXT: v_writelane_b32 v46, s55, 23
+; GCN-NEXT: v_writelane_b32 v46, s56, 24
+; GCN-NEXT: v_writelane_b32 v46, s57, 25
; GCN-NEXT: s_mov_b64 s[34:35], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_mov_b32 s46, s15
@@ -303,15 +304,15 @@ define hidden void @blam() {
; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40
-; GCN-NEXT: flat_load_dword v43, v[0:1]
+; GCN-NEXT: v_and_b32_e32 v41, 0x3ff, v40
+; GCN-NEXT: flat_load_dword v45, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: s_mov_b64 s[50:51], 0
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2
+; GCN-NEXT: v_lshl_b64 v[43:44], v[41:42], 2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v43
-; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v43
-; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000
+; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v45
+; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v45
+; GCN-NEXT: v_mov_b32_e32 v41, 0x7fc00000
; GCN-NEXT: s_branch .LBB1_2
; GCN-NEXT: .LBB1_1: ; %Flow7
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
@@ -322,7 +323,7 @@ define hidden void @blam() {
; GCN-NEXT: s_cbranch_execz .LBB1_18
; GCN-NEXT: .LBB1_2: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: flat_load_dword v0, v[41:42]
+; GCN-NEXT: flat_load_dword v0, v[43:44]
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_waitcnt vmcnt(1)
@@ -362,7 +363,7 @@ define hidden void @blam() {
; GCN-NEXT: s_cbranch_execz .LBB1_7
; GCN-NEXT: ; %bb.6: ; %bb16
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec
; GCN-NEXT: .LBB1_7: ; %Flow3
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
@@ -374,7 +375,7 @@ define hidden void @blam() {
; GCN-NEXT: ; %bb.8: ; %bb17
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
; GCN-NEXT: .LBB1_9: ; %Flow4
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
@@ -404,7 +405,7 @@ define hidden void @blam() {
; GCN-NEXT: s_cbranch_execz .LBB1_15
; GCN-NEXT: ; %bb.14: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec
; GCN-NEXT: .LBB1_15: ; %Flow6
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
@@ -422,45 +423,46 @@ define hidden void @blam() {
; GCN-NEXT: s_cbranch_execz .LBB1_1
; GCN-NEXT: ; %bb.17: ; %bb18
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_branch .LBB1_1
; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
; GCN-NEXT: s_or_b64 exec, exec, s[50:51]
-; GCN-NEXT: v_readlane_b32 s57, v45, 25
-; GCN-NEXT: v_readlane_b32 s56, v45, 24
-; GCN-NEXT: v_readlane_b32 s55, v45, 23
-; GCN-NEXT: v_readlane_b32 s54, v45, 22
-; GCN-NEXT: v_readlane_b32 s53, v45, 21
-; GCN-NEXT: v_readlane_b32 s52, v45, 20
-; GCN-NEXT: v_readlane_b32 s51, v45, 19
-; GCN-NEXT: v_readlane_b32 s50, v45, 18
-; GCN-NEXT: v_readlane_b32 s49, v45, 17
-; GCN-NEXT: v_readlane_b32 s48, v45, 16
-; GCN-NEXT: v_readlane_b32 s47, v45, 15
-; GCN-NEXT: v_readlane_b32 s46, v45, 14
-; GCN-NEXT: v_readlane_b32 s45, v45, 13
-; GCN-NEXT: v_readlane_b32 s44, v45, 12
-; GCN-NEXT: v_readlane_b32 s43, v45, 11
-; GCN-NEXT: v_readlane_b32 s42, v45, 10
-; GCN-NEXT: v_readlane_b32 s41, v45, 9
-; GCN-NEXT: v_readlane_b32 s40, v45, 8
-; GCN-NEXT: v_readlane_b32 s39, v45, 7
-; GCN-NEXT: v_readlane_b32 s38, v45, 6
-; GCN-NEXT: v_readlane_b32 s37, v45, 5
-; GCN-NEXT: v_readlane_b32 s36, v45, 4
-; GCN-NEXT: v_readlane_b32 s35, v45, 3
-; GCN-NEXT: v_readlane_b32 s34, v45, 2
-; GCN-NEXT: v_readlane_b32 s31, v45, 1
-; GCN-NEXT: v_readlane_b32 s30, v45, 0
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: v_readlane_b32 s4, v45, 26
+; GCN-NEXT: v_readlane_b32 s57, v46, 25
+; GCN-NEXT: v_readlane_b32 s56, v46, 24
+; GCN-NEXT: v_readlane_b32 s55, v46, 23
+; GCN-NEXT: v_readlane_b32 s54, v46, 22
+; GCN-NEXT: v_readlane_b32 s53, v46, 21
+; GCN-NEXT: v_readlane_b32 s52, v46, 20
+; GCN-NEXT: v_readlane_b32 s51, v46, 19
+; GCN-NEXT: v_readlane_b32 s50, v46, 18
+; GCN-NEXT: v_readlane_b32 s49, v46, 17
+; GCN-NEXT: v_readlane_b32 s48, v46, 16
+; GCN-NEXT: v_readlane_b32 s47, v46, 15
+; GCN-NEXT: v_readlane_b32 s46, v46, 14
+; GCN-NEXT: v_readlane_b32 s45, v46, 13
+; GCN-NEXT: v_readlane_b32 s44, v46, 12
+; GCN-NEXT: v_readlane_b32 s43, v46, 11
+; GCN-NEXT: v_readlane_b32 s42, v46, 10
+; GCN-NEXT: v_readlane_b32 s41, v46, 9
+; GCN-NEXT: v_readlane_b32 s40, v46, 8
+; GCN-NEXT: v_readlane_b32 s39, v46, 7
+; GCN-NEXT: v_readlane_b32 s38, v46, 6
+; GCN-NEXT: v_readlane_b32 s37, v46, 5
+; GCN-NEXT: v_readlane_b32 s36, v46, 4
+; GCN-NEXT: v_readlane_b32 s35, v46, 3
+; GCN-NEXT: v_readlane_b32 s34, v46, 2
+; GCN-NEXT: v_readlane_b32 s31, v46, 1
+; GCN-NEXT: v_readlane_b32 s30, v46, 0
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: v_readlane_b32 s4, v46, 26
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xf800
; GCN-NEXT: s_mov_b32 s33, s4
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index f35589853393c5..77e19e4a31b977 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -12,8 +12,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT: s_sub_u32 s0, 0, s12
-; GCN-NEXT: s_subb_u32 s1, 0, s13
+; GCN-NEXT: s_sub_u32 s2, 0, s12
+; GCN-NEXT: s_subb_u32 s3, 0, s13
; GCN-NEXT: s_mov_b32 s4, s8
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
@@ -22,68 +22,74 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s2, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s3, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s10, v1
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s2, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s3, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s2, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v3, s10, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s10, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s11, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s11, v1
; GCN-NEXT: v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v1, s12, v1
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s12, v1
; GCN-NEXT: v_mul_lo_u32 v3, s13, v0
; GCN-NEXT: v_mul_lo_u32 v0, s12, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
@@ -213,76 +219,82 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GCN-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc
; GCN-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
; GCN-NEXT: v_rcp_f32_e32 v4, v4
; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GCN-NEXT: v_trunc_f32_e32 v5, v5
; GCN-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
-; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v8, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v8, v7
+; GCN-NEXT: v_mul_lo_u32 v10, v9, v7
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v8, v7
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GCN-NEXT: v_mul_lo_u32 v10, v7, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_mul_lo_u32 v9, v6, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT: v_mul_lo_u32 v10, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v11, v4, v9
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v8
-; GCN-NEXT: v_mul_hi_u32 v13, v5, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v5, v8
+; GCN-NEXT: v_mul_hi_u32 v11, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v12, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v13, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v6, v4
; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT: v_mul_lo_u32 v12, v5, v9
-; GCN-NEXT: v_mul_hi_u32 v9, v5, v9
+; GCN-NEXT: v_mul_lo_u32 v12, v6, v5
+; GCN-NEXT: v_mul_hi_u32 v5, v6, v5
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v5, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v11
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v5, vcc
+; GCN-NEXT: v_mul_lo_u32 v4, v8, v7
+; GCN-NEXT: v_mul_hi_u32 v5, v8, v7
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v6
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v7
+; GCN-NEXT: v_mul_hi_u32 v10, v6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GCN-NEXT: v_mul_lo_u32 v8, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v9, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v11, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v6, v4
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc
-; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v6, v4
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc
+; GCN-NEXT: v_mul_hi_u32 v11, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v5, v6, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v10, vcc
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v4, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v9
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GCN-NEXT: v_mul_lo_u32 v7, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v8, v0, v4
+; GCN-NEXT: v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT: v_mul_lo_u32 v10, v4, v7
-; GCN-NEXT: v_mul_hi_u32 v11, v4, v6
-; GCN-NEXT: v_mul_hi_u32 v12, v4, v7
-; GCN-NEXT: v_mul_hi_u32 v9, v5, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v5, v6
-; GCN-NEXT: v_mul_hi_u32 v8, v5, v7
-; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT: v_mul_lo_u32 v7, v5, v7
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v5
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
; GCN-NEXT: v_mul_hi_u32 v9, v1, v5
; GCN-NEXT: v_mul_lo_u32 v5, v1, v5
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v8, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v2, v5
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v4, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 32
+; GCN-NEXT: v_or_b32_e32 v4, v4, v7
; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v2, v5
; GCN-NEXT: v_mul_lo_u32 v7, v3, v4
; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
@@ -793,52 +805,56 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT: v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s0, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s0, v3
+; GCN-NEXT: v_mul_lo_u32 v5, s1, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v8, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v6, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s0, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_hi_u32 v1, s0, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s0, v2
+; GCN-NEXT: v_mul_lo_u32 v6, s1, v3
+; GCN-NEXT: v_mul_lo_u32 v0, s0, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v6, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v5, v2, v0
+; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v7, v2, v1
+; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v6, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v2, v1, 24
; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
@@ -961,56 +977,61 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) {
; GCN-LABEL: s_test_urem_k_den_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_add_u32 s0, 0, 0xaaaa0000
+; GCN-NEXT: s_add_u32 s4, 0, 0xaaaa0000
+; GCN-NEXT: s_addc_u32 s2, 0, 42
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 32
+; GCN-NEXT: s_or_b32 s8, s2, s4
; GCN-NEXT: v_mov_b32_e32 v0, 0xffffffe8
-; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT: s_addc_u32 s1, 0, 42
-; GCN-NEXT: s_add_i32 s1, s1, 0xaaaaa80
-; GCN-NEXT: s_mul_i32 s8, s0, 0xffffffe8
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-NEXT: s_mul_i32 s9, s1, 0xffffffe8
-; GCN-NEXT: v_mov_b32_e32 v1, s8
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v2, s1, v1
-; GCN-NEXT: v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT: v_mul_hi_u32 v1, s0, v1
-; GCN-NEXT: v_mul_hi_u32 v4, s0, v0
-; GCN-NEXT: s_mul_i32 s8, s1, s8
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_add_i32 s0, s3, 0xaaaaa80
+; GCN-NEXT: s_mul_i32 s1, s0, 0xffffffe8
+; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0
+; GCN-NEXT: s_mul_i32 s1, s8, 0xffffffe8
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mul_lo_u32 v1, s8, v0
+; GCN-NEXT: v_mul_hi_u32 v3, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v4, s8, v0
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v2
+; GCN-NEXT: s_mul_i32 s1, s0, s1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_hi_u32 v4, s1, v0
-; GCN-NEXT: v_mul_lo_u32 v0, s1, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, s8, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s0, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s1, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s8, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v2, s6, v1
-; GCN-NEXT: v_mul_hi_u32 v3, s6, v0
-; GCN-NEXT: v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT: v_mul_lo_u32 v3, s6, v1
+; GCN-NEXT: v_mul_hi_u32 v4, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v5, s6, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s7, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_hi_u32 v5, s7, v1
; GCN-NEXT: v_mul_lo_u32 v1, s7, v1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, s7, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v1, v1, 24
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 32
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: v_mul_hi_u32 v2, v0, 24
+; GCN-NEXT: v_mul_lo_u32 v1, v1, 24
; GCN-NEXT: v_mul_lo_u32 v0, v0, 24
-; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
@@ -1031,6 +1052,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1123,67 +1145,72 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
-; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v1, vcc
; GCN-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; GCN-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-NEXT: s_mov_b32 s4, 0x8000
; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GCN-NEXT: v_trunc_f32_e32 v3, v3
; GCN-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v2
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2
+; GCN-NEXT: v_mul_lo_u32 v2, v6, v4
+; GCN-NEXT: v_mul_hi_u32 v3, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v8, v7, v5
+; GCN-NEXT: v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GCN-NEXT: v_mul_hi_u32 v3, v5, v9
; GCN-NEXT: v_mul_lo_u32 v8, v5, v2
-; GCN-NEXT: v_mul_lo_u32 v9, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_mul_hi_u32 v7, v2, v9
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v11, v3, v6
-; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GCN-NEXT: v_mul_hi_u32 v10, v5, v2
+; GCN-NEXT: v_mul_hi_u32 v11, v4, v2
+; GCN-NEXT: v_mul_lo_u32 v2, v4, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v10, v3, v9
-; GCN-NEXT: v_mul_hi_u32 v9, v3, v9
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc
+; GCN-NEXT: v_mul_lo_u32 v10, v4, v9
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v9
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v9, vcc
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v3, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v8, vcc
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; GCN-NEXT: v_or_b32_e32 v2, v2, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v2
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v3, vcc
+; GCN-NEXT: v_mul_lo_u32 v2, v6, v5
+; GCN-NEXT: v_mul_hi_u32 v3, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v5
+; GCN-NEXT: v_mul_hi_u32 v8, v4, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v5, v3
+; GCN-NEXT: v_mul_hi_u32 v7, v5, v2
+; GCN-NEXT: v_mul_hi_u32 v9, v5, v3
+; GCN-NEXT: v_mul_lo_u32 v2, v4, v2
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v4, v2
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v2
-; GCN-NEXT: v_mul_lo_u32 v4, v4, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT: v_mul_lo_u32 v8, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v9, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v10, v2, v5
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_mul_hi_u32 v6, v3, v5
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT: v_mul_lo_u32 v5, v3, v5
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v3
+; GCN-NEXT: v_mul_lo_u32 v3, v4, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v8, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v2, v3
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 32
+; GCN-NEXT: v_or_b32_e32 v2, v2, v7
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v4, v3, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
; GCN-NEXT: v_mul_lo_u32 v3, v1, v2
; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0
; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 8cc7025d671c47..0b658b19807187 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -207,16 +207,17 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_max_u32_e32 v1, v1, v3
+; GFX6-NEXT: v_max_u32_e32 v1, v1, v4
; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 66c49ba8b734db..92975bb83a8e25 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4237,261 +4237,294 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
; GFX9-LABEL: fma_shuffle_v2bf16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: s_mov_b32 s3, 0x7060302
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[0:1]
-; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7
-; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1
-; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2
-; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1
-; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1
-; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8
-; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2
-; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_fma_f32 v4, v11, v5, v4
+; GFX9-NEXT: v_fma_f32 v5, v7, v9, v8
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX9-NEXT: v_fma_f32 v9, v11, v9, v12
+; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX9-NEXT: v_add3_u32 v13, v13, v5, s2
; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
-; GFX9-NEXT: v_perm_b32 v2, v4, v2, s3
-; GFX9-NEXT: v_perm_b32 v1, v3, v1, s3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX9-NEXT: v_add3_u32 v7, v15, v9, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v16, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_fma_f32 v3, v1, v10, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_fma_f32 v4, v2, v10, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX9-NEXT: v_fma_f32 v1, v1, v6, v5
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: v_fma_f32 v2, v2, v6, v7
+; GFX9-NEXT: v_add3_u32 v6, v8, v3, s2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v6, v7, v1, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX9-NEXT: v_add3_u32 v5, v11, v2, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v12, vcc
+; GFX9-NEXT: v_perm_b32 v2, v2, v4, s3
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s3
; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fma_shuffle_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9
-; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9
-; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4
-; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_fmac_f32_e32 v2, v7, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_fmac_f32_e32 v3, v11, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v8, v7, v9
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v9
+; GFX10-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX10-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v13, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_add3_u32 v7, v15, v12, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5
-; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v16, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v10
+; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v5
+; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_fmac_f32_e32 v7, v1, v5
+; GFX10-NEXT: v_add3_u32 v5, v8, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX10-NEXT: v_add3_u32 v5, v11, v7, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fma_shuffle_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
-; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5]
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[0:1]
; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_fmac_f32_e32 v12, v11, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_fmac_f32 v11, v12, v9 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_fmac_f32_e32 v1, v12, v4
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_fmac_f32 v8, v7, v9 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_fmac_f32 v3, v11, v4 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_bfe_u32 v13, v8, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_fmac_f32_e32 v1, v3, v10
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-NEXT: v_fmac_f32_e32 v2, v7, v4
+; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v4, v13, v8, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v15, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v16, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_dual_fmac_f32 v4, v0, v5 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v7, v1, v5
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v10
+; GFX11-NEXT: v_fmac_f32_e32 v3, v1, v10
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v5, v8, v2, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v5, v11, v7, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3]
+; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 901e88a4c6aca8..2688b5f9b3651a 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -741,8 +741,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1032-NEXT: s_sub_u32 s9, 0, s4
-; GFX1032-NEXT: s_subb_u32 s10, 0, s5
+; GFX1032-NEXT: s_sub_u32 s10, 0, s4
+; GFX1032-NEXT: s_subb_u32 s11, 0, s5
; GFX1032-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1032-NEXT: v_rcp_f32_e32 v0, v0
; GFX1032-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -751,111 +751,117 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1032-NEXT: v_readfirstlane_b32 s1, v0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s0
-; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s1
-; GFX1032-NEXT: s_mul_i32 s12, s10, s1
-; GFX1032-NEXT: s_add_i32 s11, s13, s11
-; GFX1032-NEXT: s_mul_i32 s14, s9, s1
-; GFX1032-NEXT: s_add_i32 s11, s11, s12
-; GFX1032-NEXT: s_mul_hi_u32 s13, s1, s14
-; GFX1032-NEXT: s_mul_hi_u32 s15, s0, s14
-; GFX1032-NEXT: s_mul_i32 s12, s0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX1032-NEXT: s_mul_i32 s1, s1, s11
-; GFX1032-NEXT: s_mul_hi_u32 s16, s0, s11
-; GFX1032-NEXT: s_add_u32 s1, s13, s1
+; GFX1032-NEXT: v_readfirstlane_b32 s9, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mul_i32 s1, s10, s9
+; GFX1032-NEXT: s_mul_hi_u32 s13, s10, s0
+; GFX1032-NEXT: s_mul_i32 s12, s11, s0
+; GFX1032-NEXT: s_add_i32 s1, s13, s1
+; GFX1032-NEXT: s_mul_i32 s14, s10, s0
+; GFX1032-NEXT: s_add_i32 s1, s1, s12
+; GFX1032-NEXT: s_mul_hi_u32 s13, s0, s14
+; GFX1032-NEXT: s_mul_hi_u32 s15, s9, s14
+; GFX1032-NEXT: s_mul_i32 s12, s9, s14
+; GFX1032-NEXT: s_mul_hi_u32 s14, s0, s1
+; GFX1032-NEXT: s_mul_i32 s0, s0, s1
+; GFX1032-NEXT: s_mul_hi_u32 s16, s9, s1
+; GFX1032-NEXT: s_add_u32 s0, s13, s0
; GFX1032-NEXT: s_addc_u32 s13, 0, s14
-; GFX1032-NEXT: s_add_u32 s1, s1, s12
-; GFX1032-NEXT: s_mul_i32 s11, s0, s11
-; GFX1032-NEXT: s_addc_u32 s1, s13, s15
+; GFX1032-NEXT: s_add_u32 s0, s0, s12
+; GFX1032-NEXT: s_mul_i32 s1, s9, s1
+; GFX1032-NEXT: s_addc_u32 s0, s13, s15
; GFX1032-NEXT: s_addc_u32 s12, s16, 0
-; GFX1032-NEXT: s_add_u32 s1, s1, s11
+; GFX1032-NEXT: s_add_u32 s13, s0, s1
+; GFX1032-NEXT: s_addc_u32 s0, 0, s12
+; GFX1032-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1032-NEXT: s_or_b32 s0, s0, s13
+; GFX1032-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_addc_u32 s9, s9, s1
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mul_i32 s1, s10, s9
+; GFX1032-NEXT: s_mul_i32 s12, s10, s0
+; GFX1032-NEXT: s_mul_hi_u32 s10, s10, s0
+; GFX1032-NEXT: s_mul_i32 s11, s11, s0
+; GFX1032-NEXT: s_add_i32 s1, s10, s1
+; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1032-NEXT: s_add_i32 s1, s1, s11
+; GFX1032-NEXT: s_mul_i32 s10, s9, s12
+; GFX1032-NEXT: s_mul_hi_u32 s11, s0, s12
+; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s1
+; GFX1032-NEXT: s_mul_i32 s0, s0, s1
+; GFX1032-NEXT: s_mul_hi_u32 s14, s9, s1
+; GFX1032-NEXT: s_add_u32 s0, s11, s0
; GFX1032-NEXT: s_addc_u32 s11, 0, s12
-; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT: s_addc_u32 s0, s0, s11
-; GFX1032-NEXT: v_readfirstlane_b32 s1, v0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s0
-; GFX1032-NEXT: s_mul_hi_u32 s12, s9, s1
-; GFX1032-NEXT: s_mul_i32 s10, s10, s1
-; GFX1032-NEXT: s_add_i32 s11, s12, s11
-; GFX1032-NEXT: s_mul_i32 s9, s9, s1
-; GFX1032-NEXT: s_add_i32 s11, s11, s10
-; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s9
-; GFX1032-NEXT: s_mul_i32 s13, s0, s9
-; GFX1032-NEXT: s_mul_hi_u32 s9, s1, s9
-; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11
-; GFX1032-NEXT: s_mul_i32 s1, s1, s11
-; GFX1032-NEXT: s_mul_hi_u32 s10, s0, s11
-; GFX1032-NEXT: s_add_u32 s1, s9, s1
-; GFX1032-NEXT: s_addc_u32 s9, 0, s14
-; GFX1032-NEXT: s_add_u32 s1, s1, s13
-; GFX1032-NEXT: s_mul_i32 s11, s0, s11
-; GFX1032-NEXT: s_addc_u32 s1, s9, s12
-; GFX1032-NEXT: s_addc_u32 s9, s10, 0
-; GFX1032-NEXT: s_add_u32 s1, s1, s11
-; GFX1032-NEXT: s_addc_u32 s9, 0, s9
-; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT: s_addc_u32 s0, s0, s9
+; GFX1032-NEXT: s_add_u32 s0, s0, s10
+; GFX1032-NEXT: s_mul_i32 s1, s9, s1
+; GFX1032-NEXT: s_addc_u32 s0, s11, s13
+; GFX1032-NEXT: s_addc_u32 s10, s14, 0
+; GFX1032-NEXT: s_add_u32 s11, s0, s1
+; GFX1032-NEXT: s_addc_u32 s0, 0, s10
+; GFX1032-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1032-NEXT: s_or_b32 s0, s0, s11
+; GFX1032-NEXT: v_add_co_u32 v0, s0, v0, s0
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_addc_u32 s0, s9, s1
; GFX1032-NEXT: v_readfirstlane_b32 s1, v0
; GFX1032-NEXT: s_mul_i32 s10, s6, s0
; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s0
; GFX1032-NEXT: s_mul_hi_u32 s11, s7, s0
; GFX1032-NEXT: s_mul_i32 s0, s7, s0
-; GFX1032-NEXT: s_mul_hi_u32 s12, s6, s1
-; GFX1032-NEXT: s_mul_hi_u32 s13, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s13, s6, s1
+; GFX1032-NEXT: s_mul_hi_u32 s12, s7, s1
; GFX1032-NEXT: s_mul_i32 s1, s7, s1
-; GFX1032-NEXT: s_add_u32 s10, s12, s10
+; GFX1032-NEXT: s_add_u32 s10, s13, s10
; GFX1032-NEXT: s_addc_u32 s9, 0, s9
; GFX1032-NEXT: s_add_u32 s1, s10, s1
-; GFX1032-NEXT: s_addc_u32 s1, s9, s13
+; GFX1032-NEXT: s_addc_u32 s1, s9, s12
; GFX1032-NEXT: s_addc_u32 s9, s11, 0
-; GFX1032-NEXT: s_add_u32 s1, s1, s0
-; GFX1032-NEXT: s_addc_u32 s9, 0, s9
-; GFX1032-NEXT: s_mul_hi_u32 s0, s4, s1
-; GFX1032-NEXT: s_mul_i32 s11, s4, s9
+; GFX1032-NEXT: s_add_u32 s10, s1, s0
+; GFX1032-NEXT: s_addc_u32 s0, 0, s9
+; GFX1032-NEXT: s_mul_hi_u32 s9, s4, s10
+; GFX1032-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1032-NEXT: s_mul_i32 s13, s4, s10
; GFX1032-NEXT: s_mul_i32 s12, s4, s1
-; GFX1032-NEXT: s_add_i32 s0, s0, s11
-; GFX1032-NEXT: v_sub_co_u32 v0, s11, s6, s12
-; GFX1032-NEXT: s_mul_i32 s10, s5, s1
-; GFX1032-NEXT: s_add_i32 s0, s0, s10
-; GFX1032-NEXT: v_sub_co_u32 v1, s12, v0, s4
-; GFX1032-NEXT: s_sub_i32 s10, s7, s0
-; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1032-NEXT: s_subb_u32 s10, s10, s5
+; GFX1032-NEXT: s_mul_i32 s11, s5, s10
+; GFX1032-NEXT: s_add_i32 s9, s9, s12
+; GFX1032-NEXT: v_sub_co_u32 v0, s12, s6, s13
+; GFX1032-NEXT: s_add_i32 s9, s9, s11
+; GFX1032-NEXT: s_sub_i32 s11, s7, s9
+; GFX1032-NEXT: v_sub_co_u32 v1, s13, v0, s4
; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
+; GFX1032-NEXT: s_subb_u32 s11, s11, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s13, 0
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
-; GFX1032-NEXT: s_subb_u32 s10, s10, 0
-; GFX1032-NEXT: s_cmp_ge_u32 s10, s5
+; GFX1032-NEXT: s_subb_u32 s11, s11, 0
+; GFX1032-NEXT: s_cmp_ge_u32 s11, s5
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1032-NEXT: s_cmp_eq_u32 s10, s5
+; GFX1032-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1032-NEXT: s_cmp_eq_u32 s11, s5
; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX1032-NEXT: s_add_u32 s10, s1, 1
-; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
-; GFX1032-NEXT: s_addc_u32 s12, s9, 0
-; GFX1032-NEXT: s_add_u32 s13, s1, 2
-; GFX1032-NEXT: s_addc_u32 s14, s9, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1032-NEXT: s_or_b32 s10, s0, s10
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
+; GFX1032-NEXT: s_add_u32 s11, s10, 1
+; GFX1032-NEXT: s_addc_u32 s13, s1, 0
+; GFX1032-NEXT: s_add_u32 s0, s10, 2
+; GFX1032-NEXT: s_addc_u32 s14, s1, 0
+; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v0
-; GFX1032-NEXT: s_subb_u32 s0, s7, s0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s13
-; GFX1032-NEXT: s_cmp_ge_u32 s0, s5
+; GFX1032-NEXT: s_subb_u32 s7, s7, s9
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
+; GFX1032-NEXT: s_cmp_ge_u32 s7, s5
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX1032-NEXT: s_cselect_b32 s7, -1, 0
-; GFX1032-NEXT: s_cmp_eq_u32 s0, s5
+; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1032-NEXT: s_cmp_eq_u32 s7, s5
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, s14
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
-; GFX1032-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, s9, v0, s0
+; GFX1032-NEXT: v_cndmask_b32_e32 v2, s11, v2, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc_lo
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, s1, v1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3
; GFX1032-NEXT: .LBB15_2:
@@ -903,8 +909,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1064-NEXT: s_sub_u32 s9, 0, s4
-; GFX1064-NEXT: s_subb_u32 s10, 0, s5
+; GFX1064-NEXT: s_sub_u32 s11, 0, s4
+; GFX1064-NEXT: s_subb_u32 s12, 0, s5
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -913,98 +919,104 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT: s_mul_i32 s1, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s0
-; GFX1064-NEXT: s_mul_i32 s11, s10, s0
-; GFX1064-NEXT: s_add_i32 s1, s12, s1
-; GFX1064-NEXT: s_mul_i32 s13, s9, s0
-; GFX1064-NEXT: s_add_i32 s1, s1, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT: s_mul_i32 s11, s8, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1
-; GFX1064-NEXT: s_mul_i32 s0, s0, s1
-; GFX1064-NEXT: s_mul_hi_u32 s15, s8, s1
-; GFX1064-NEXT: s_add_u32 s0, s12, s0
-; GFX1064-NEXT: s_addc_u32 s12, 0, s13
-; GFX1064-NEXT: s_add_u32 s0, s0, s11
-; GFX1064-NEXT: s_mul_i32 s1, s8, s1
-; GFX1064-NEXT: s_addc_u32 s0, s12, s14
-; GFX1064-NEXT: s_addc_u32 s11, s15, 0
-; GFX1064-NEXT: s_add_u32 s0, s0, s1
-; GFX1064-NEXT: s_addc_u32 s11, 0, s11
-; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: s_addc_u32 s8, s8, s11
+; GFX1064-NEXT: v_readfirstlane_b32 s10, v1
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT: s_mul_i32 s1, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s11, s9, s0
-; GFX1064-NEXT: s_mul_i32 s10, s10, s0
-; GFX1064-NEXT: s_add_i32 s1, s11, s1
-; GFX1064-NEXT: s_mul_i32 s9, s9, s0
-; GFX1064-NEXT: s_add_i32 s1, s1, s10
-; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s9
-; GFX1064-NEXT: s_mul_i32 s12, s8, s9
-; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s9
+; GFX1064-NEXT: s_mul_i32 s1, s11, s10
+; GFX1064-NEXT: s_mul_hi_u32 s9, s11, s0
+; GFX1064-NEXT: s_mul_i32 s8, s12, s0
+; GFX1064-NEXT: s_add_i32 s1, s9, s1
+; GFX1064-NEXT: s_mul_i32 s13, s11, s0
+; GFX1064-NEXT: s_add_i32 s1, s1, s8
+; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s13
+; GFX1064-NEXT: s_mul_hi_u32 s14, s10, s13
+; GFX1064-NEXT: s_mul_i32 s8, s10, s13
; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1
; GFX1064-NEXT: s_mul_i32 s0, s0, s1
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s1
+; GFX1064-NEXT: s_mul_hi_u32 s15, s10, s1
; GFX1064-NEXT: s_add_u32 s0, s9, s0
; GFX1064-NEXT: s_addc_u32 s9, 0, s13
-; GFX1064-NEXT: s_add_u32 s0, s0, s12
-; GFX1064-NEXT: s_mul_i32 s1, s8, s1
-; GFX1064-NEXT: s_addc_u32 s0, s9, s11
-; GFX1064-NEXT: s_addc_u32 s9, s10, 0
-; GFX1064-NEXT: s_add_u32 s0, s0, s1
-; GFX1064-NEXT: s_addc_u32 s9, 0, s9
-; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: s_addc_u32 s0, s8, s9
+; GFX1064-NEXT: s_add_u32 s0, s0, s8
+; GFX1064-NEXT: s_mul_i32 s1, s10, s1
+; GFX1064-NEXT: s_addc_u32 s0, s9, s14
+; GFX1064-NEXT: s_addc_u32 s8, s15, 0
+; GFX1064-NEXT: s_add_u32 s9, s0, s1
+; GFX1064-NEXT: s_addc_u32 s0, 0, s8
+; GFX1064-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1064-NEXT: s_or_b32 s0, s0, s9
+; GFX1064-NEXT: v_add_co_u32 v0, s[8:9], v0, s0
+; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1064-NEXT: s_addc_u32 s10, s10, s1
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mul_i32 s1, s11, s10
+; GFX1064-NEXT: s_mul_hi_u32 s9, s11, s0
+; GFX1064-NEXT: s_mul_i32 s12, s12, s0
+; GFX1064-NEXT: s_add_i32 s1, s9, s1
+; GFX1064-NEXT: s_mul_i32 s8, s11, s0
+; GFX1064-NEXT: s_add_i32 s1, s1, s12
+; GFX1064-NEXT: s_mul_hi_u32 s11, s10, s8
+; GFX1064-NEXT: s_mul_i32 s9, s10, s8
+; GFX1064-NEXT: s_mul_hi_u32 s8, s0, s8
+; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s1
+; GFX1064-NEXT: s_mul_i32 s0, s0, s1
+; GFX1064-NEXT: s_mul_hi_u32 s13, s10, s1
+; GFX1064-NEXT: s_add_u32 s0, s8, s0
+; GFX1064-NEXT: s_addc_u32 s8, 0, s12
+; GFX1064-NEXT: s_add_u32 s0, s0, s9
+; GFX1064-NEXT: s_mul_i32 s1, s10, s1
+; GFX1064-NEXT: s_addc_u32 s0, s8, s11
+; GFX1064-NEXT: s_addc_u32 s8, s13, 0
+; GFX1064-NEXT: s_add_u32 s9, s0, s1
+; GFX1064-NEXT: s_addc_u32 s0, 0, s8
+; GFX1064-NEXT: s_lshl_b64 s[0:1], s[0:1], 32
+; GFX1064-NEXT: s_or_b32 s0, s0, s9
+; GFX1064-NEXT: v_add_co_u32 v0, s[8:9], v0, s0
+; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1064-NEXT: s_addc_u32 s0, s10, s1
; GFX1064-NEXT: v_readfirstlane_b32 s1, v0
; GFX1064-NEXT: s_mul_i32 s9, s6, s0
; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s0
; GFX1064-NEXT: s_mul_hi_u32 s10, s7, s0
; GFX1064-NEXT: s_mul_i32 s0, s7, s0
-; GFX1064-NEXT: s_mul_hi_u32 s11, s6, s1
-; GFX1064-NEXT: s_mul_hi_u32 s12, s7, s1
+; GFX1064-NEXT: s_mul_hi_u32 s12, s6, s1
+; GFX1064-NEXT: s_mul_hi_u32 s11, s7, s1
; GFX1064-NEXT: s_mul_i32 s1, s7, s1
-; GFX1064-NEXT: s_add_u32 s9, s11, s9
+; GFX1064-NEXT: s_add_u32 s9, s12, s9
; GFX1064-NEXT: s_addc_u32 s8, 0, s8
; GFX1064-NEXT: s_add_u32 s1, s9, s1
-; GFX1064-NEXT: s_addc_u32 s1, s8, s12
+; GFX1064-NEXT: s_addc_u32 s1, s8, s11
; GFX1064-NEXT: s_addc_u32 s8, s10, 0
-; GFX1064-NEXT: s_add_u32 s10, s1, s0
-; GFX1064-NEXT: s_addc_u32 s11, 0, s8
-; GFX1064-NEXT: s_mul_hi_u32 s0, s4, s10
-; GFX1064-NEXT: s_mul_i32 s1, s4, s11
-; GFX1064-NEXT: s_mul_i32 s9, s4, s10
-; GFX1064-NEXT: s_add_i32 s12, s0, s1
-; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9
-; GFX1064-NEXT: s_mul_i32 s8, s5, s10
-; GFX1064-NEXT: s_add_i32 s12, s12, s8
-; GFX1064-NEXT: v_sub_co_u32 v1, s[8:9], v0, s4
-; GFX1064-NEXT: s_sub_i32 s13, s7, s12
+; GFX1064-NEXT: s_add_u32 s12, s1, s0
+; GFX1064-NEXT: s_addc_u32 s0, 0, s8
+; GFX1064-NEXT: s_mul_hi_u32 s1, s4, s12
+; GFX1064-NEXT: s_mul_i32 s11, s4, s12
+; GFX1064-NEXT: s_lshl_b64 s[8:9], s[0:1], 32
+; GFX1064-NEXT: s_mul_i32 s10, s5, s12
+; GFX1064-NEXT: s_mul_i32 s0, s4, s9
+; GFX1064-NEXT: s_add_i32 s13, s1, s0
+; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s11
+; GFX1064-NEXT: s_add_i32 s13, s13, s10
+; GFX1064-NEXT: s_sub_i32 s14, s7, s13
+; GFX1064-NEXT: v_sub_co_u32 v1, s[10:11], v0, s4
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: s_subb_u32 s13, s13, s5
-; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1064-NEXT: s_subb_u32 s14, s14, s5
+; GFX1064-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
-; GFX1064-NEXT: s_subb_u32 s8, s13, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s8, s5
+; GFX1064-NEXT: s_subb_u32 s10, s14, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s10, s5
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s8, s5
+; GFX1064-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1064-NEXT: s_cmp_eq_u32 s10, s5
; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX1064-NEXT: s_add_u32 s8, s10, 1
-; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
-; GFX1064-NEXT: s_addc_u32 s9, s11, 0
-; GFX1064-NEXT: s_add_u32 s13, s10, 2
-; GFX1064-NEXT: s_addc_u32 s14, s11, 0
+; GFX1064-NEXT: s_or_b32 s8, s8, s12
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc
+; GFX1064-NEXT: s_add_u32 s10, s8, 1
+; GFX1064-NEXT: s_addc_u32 s11, s9, 0
+; GFX1064-NEXT: s_add_u32 s12, s8, 2
+; GFX1064-NEXT: s_addc_u32 s14, s9, 0
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; GFX1064-NEXT: s_subb_u32 s0, s7, s12
-; GFX1064-NEXT: v_mov_b32_e32 v2, s13
+; GFX1064-NEXT: s_subb_u32 s0, s7, s13
+; GFX1064-NEXT: v_mov_b32_e32 v2, s12
; GFX1064-NEXT: s_cmp_ge_u32 s0, s5
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX1064-NEXT: s_cselect_b32 s7, -1, 0
@@ -1013,11 +1025,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, s14
; GFX1064-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc
-; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
-; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc
; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc
-; GFX1064-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc
+; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, s8, v2, vcc
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index e0b320aa4f3727..f3a5f406739eae 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -303,13 +303,15 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s1, 0xff00
-; SI-NEXT: s_add_i32 s1, s1, 12
-; SI-NEXT: s_or_b32 s1, s1, 4
-; SI-NEXT: s_and_b32 s1, s1, 0xff
-; SI-NEXT: s_or_b32 s1, s4, s1
-; SI-NEXT: s_addk_i32 s1, 0x2c00
-; SI-NEXT: s_or_b32 s4, s1, 0x300
+; SI-NEXT: s_and_b32 s4, s1, 0xffff
+; SI-NEXT: s_bfe_u32 s1, s1, 0x80008
+; SI-NEXT: s_add_i32 s4, s4, 12
+; SI-NEXT: s_add_i32 s1, s1, 44
+; SI-NEXT: s_or_b32 s1, s1, 3
+; SI-NEXT: s_or_b32 s4, s4, 4
+; SI-NEXT: s_lshl_b32 s1, s1, 8
+; SI-NEXT: s_and_b32 s4, s4, 0xff
+; SI-NEXT: s_or_b32 s4, s4, s1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -343,16 +345,17 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v0, s0, 12
-; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, 4, v0
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x2c00
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v2, 0x300, v2
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/ARM/and-cmpz.ll b/llvm/test/CodeGen/ARM/and-cmpz.ll
index e1c9fe52911b99..f0cd021331ab73 100644
--- a/llvm/test/CodeGen/ARM/and-cmpz.ll
+++ b/llvm/test/CodeGen/ARM/and-cmpz.ll
@@ -193,7 +193,7 @@ false:
define void @i16_cmpz(i16 %x, ptr %foo) {
; T2-LABEL: i16_cmpz:
; T2: @ %bb.0: @ %entry
-; T2-NEXT: uxth r0, r0
+; T2-NEXT: and r0, r0, #65024
; T2-NEXT: movs r2, #0
; T2-NEXT: cmp.w r2, r0, lsr #9
; T2-NEXT: it ne
@@ -205,8 +205,10 @@ define void @i16_cmpz(i16 %x, ptr %foo) {
; T1-LABEL: i16_cmpz:
; T1: @ %bb.0: @ %entry
; T1-NEXT: push {r7, lr}
-; T1-NEXT: uxth r0, r0
-; T1-NEXT: lsrs r0, r0, #9
+; T1-NEXT: movs r2, #127
+; T1-NEXT: lsls r2, r2, #9
+; T1-NEXT: ands r2, r0
+; T1-NEXT: lsrs r0, r2, #9
; T1-NEXT: bne .LBB5_2
; T1-NEXT: @ %bb.1: @ %if.then
; T1-NEXT: movs r0, #0
diff --git a/llvm/test/CodeGen/ARM/and-load-combine.ll b/llvm/test/CodeGen/ARM/and-load-combine.ll
index dfb71483851da9..e120e0c5d72047 100644
--- a/llvm/test/CodeGen/ARM/and-load-combine.ll
+++ b/llvm/test/CodeGen/ARM/and-load-combine.ll
@@ -1553,28 +1553,32 @@ define void @test27(ptr nocapture %ptr) {
; ARM-LABEL: test27:
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: ldrb r1, [r0, #1]
-; ARM-NEXT: lsl r1, r1, #16
+; ARM-NEXT: lsl r1, r1, #8
+; ARM-NEXT: lsl r1, r1, #8
; ARM-NEXT: str r1, [r0]
; ARM-NEXT: bx lr
;
; ARMEB-LABEL: test27:
; ARMEB: @ %bb.0: @ %entry
; ARMEB-NEXT: ldrb r1, [r0, #2]
-; ARMEB-NEXT: lsl r1, r1, #16
+; ARMEB-NEXT: lsl r1, r1, #8
+; ARMEB-NEXT: lsl r1, r1, #8
; ARMEB-NEXT: str r1, [r0]
; ARMEB-NEXT: bx lr
;
; THUMB1-LABEL: test27:
; THUMB1: @ %bb.0: @ %entry
; THUMB1-NEXT: ldrb r1, [r0, #1]
-; THUMB1-NEXT: lsls r1, r1, #16
+; THUMB1-NEXT: lsls r1, r1, #8
+; THUMB1-NEXT: lsls r1, r1, #8
; THUMB1-NEXT: str r1, [r0]
; THUMB1-NEXT: bx lr
;
; THUMB2-LABEL: test27:
; THUMB2: @ %bb.0: @ %entry
; THUMB2-NEXT: ldrb r1, [r0, #1]
-; THUMB2-NEXT: lsls r1, r1, #16
+; THUMB2-NEXT: lsls r1, r1, #8
+; THUMB2-NEXT: lsls r1, r1, #8
; THUMB2-NEXT: str r1, [r0]
; THUMB2-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/ARM/bfi-chain-cse-crash.ll b/llvm/test/CodeGen/ARM/bfi-chain-cse-crash.ll
index 6424f7b665ed92..c6f0dff4adea20 100644
--- a/llvm/test/CodeGen/ARM/bfi-chain-cse-crash.ll
+++ b/llvm/test/CodeGen/ARM/bfi-chain-cse-crash.ll
@@ -7,12 +7,16 @@ define void @bfi_chain_cse_crash(ptr %0, ptr %ptr) {
; CHECK-LABEL: bfi_chain_cse_crash:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r0]
-; CHECK-NEXT: and r3, r2, #1
-; CHECK-NEXT: lsr.w r12, r2, #3
-; CHECK-NEXT: bfi r3, r12, #3, #1
-; CHECK-NEXT: strb r3, [r0]
-; CHECK-NEXT: and r0, r2, #4
-; CHECK-NEXT: bfi r0, r12, #3, #1
+; CHECK-NEXT: and r12, r2, #1
+; CHECK-NEXT: lsrs r3, r2, #3
+; CHECK-NEXT: bfi r12, r3, #3, #1
+; CHECK-NEXT: strb.w r12, [r0]
+; CHECK-NEXT: ubfx r0, r2, #2, #1
+; CHECK-NEXT: lsls r3, r2, #28
+; CHECK-NEXT: lsl.w r0, r0, #2
+; CHECK-NEXT: add.w r0, r0, #8
+; CHECK-NEXT: it pl
+; CHECK-NEXT: andpl r0, r2, #4
; CHECK-NEXT: strb r0, [r1]
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/ARM/bfi.ll b/llvm/test/CodeGen/ARM/bfi.ll
index 91a74e535a2218..bd036bc9abf1a4 100644
--- a/llvm/test/CodeGen/ARM/bfi.ll
+++ b/llvm/test/CodeGen/ARM/bfi.ll
@@ -68,6 +68,7 @@ define i32 @f4(i32 %a) nounwind {
define i32 @f5(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: f5:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ubfx r1, r1, #0, #12
; CHECK-NEXT: bfi r0, r1, #20, #4
; CHECK-NEXT: bx lr
entry:
@@ -82,6 +83,7 @@ entry:
define i32 @f6(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: f6:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ubfx r1, r1, #0, #24
; CHECK-NEXT: bfi r0, r1, #8, #9
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/ARM/bfx.ll b/llvm/test/CodeGen/ARM/bfx.ll
index fdde6be286b2bd..57bc0989798a8a 100644
--- a/llvm/test/CodeGen/ARM/bfx.ll
+++ b/llvm/test/CodeGen/ARM/bfx.ll
@@ -4,7 +4,8 @@
define i32 @sbfx1(i32 %a) {
; CHECK-LABEL: sbfx1:
; CHECK: @ %bb.0:
-; CHECK-NEXT: sbfx r0, r0, #7, #11
+; CHECK-NEXT: lsl r0, r0, #14
+; CHECK-NEXT: sbfx r0, r0, #21, #11
; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index ca5d089443542c..75372aff8b6381 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -31,8 +31,9 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r7, lsr #5
+; CHECK-NEXT: add.w r0, r0, r7, lsr #5
; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: add.w r6, r0, #10
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_1: @ %for.inc
@@ -44,8 +45,7 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
; CHECK-NEXT: mov r2, r4
; CHECK-NEXT: cmp r4, #31
; CHECK-NEXT: ldr r0, [r1, #16]
-; CHECK-NEXT: add.w r0, r0, r6, lsl #2
-; CHECK-NEXT: ldr r0, [r0, #40]
+; CHECK-NEXT: ldr.w r0, [r0, r6, lsl #2]
; CHECK-NEXT: it hi
; CHECK-NEXT: andhi r2, r7, #31
; CHECK-NEXT: lsrs r0, r2
diff --git a/llvm/test/CodeGen/ARM/demanded-bits-and.ll b/llvm/test/CodeGen/ARM/demanded-bits-and.ll
index ddfff681f30f67..9552ae41602539 100644
--- a/llvm/test/CodeGen/ARM/demanded-bits-and.ll
+++ b/llvm/test/CodeGen/ARM/demanded-bits-and.ll
@@ -15,6 +15,7 @@ define dso_local void @f(ptr %p) {
; CHECK-NEXT: add r2, r3, r2
; CHECK-NEXT: add r1, r2, r1, lsr #8
; CHECK-NEXT: add r1, r1, #2
+; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: lsr r1, r1, #2
; CHECK-NEXT: strh r1, [r0]
; CHECK-NEXT: b .LBB0_1
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 7cc623fb0a616a..3fbab15d0f3b25 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -862,21 +862,36 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;------------------------------------------------------------------------------;
define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; ARM-LABEL: scalar_i8_signbit_ne:
-; ARM: @ %bb.0:
-; ARM-NEXT: uxtb r1, r1
-; ARM-NEXT: lsl r0, r0, r1
-; ARM-NEXT: uxtb r0, r0
-; ARM-NEXT: lsr r0, r0, #7
-; ARM-NEXT: bx lr
+; ARM6-LABEL: scalar_i8_signbit_ne:
+; ARM6: @ %bb.0:
+; ARM6-NEXT: uxtb r1, r1
+; ARM6-NEXT: mov r2, #128
+; ARM6-NEXT: and r0, r2, r0, lsl r1
+; ARM6-NEXT: lsr r0, r0, #7
+; ARM6-NEXT: bx lr
+;
+; ARM78-LABEL: scalar_i8_signbit_ne:
+; ARM78: @ %bb.0:
+; ARM78-NEXT: uxtb r1, r1
+; ARM78-NEXT: lsl r0, r0, r1
+; ARM78-NEXT: ubfx r0, r0, #7, #1
+; ARM78-NEXT: bx lr
+;
+; THUMB6-LABEL: scalar_i8_signbit_ne:
+; THUMB6: @ %bb.0:
+; THUMB6-NEXT: uxtb r1, r1
+; THUMB6-NEXT: lsls r0, r1
+; THUMB6-NEXT: movs r1, #128
+; THUMB6-NEXT: ands r1, r0
+; THUMB6-NEXT: lsrs r0, r1, #7
+; THUMB6-NEXT: bx lr
;
-; THUMB-LABEL: scalar_i8_signbit_ne:
-; THUMB: @ %bb.0:
-; THUMB-NEXT: uxtb r1, r1
-; THUMB-NEXT: lsls r0, r1
-; THUMB-NEXT: uxtb r0, r0
-; THUMB-NEXT: lsrs r0, r0, #7
-; THUMB-NEXT: bx lr
+; THUMB78-LABEL: scalar_i8_signbit_ne:
+; THUMB78: @ %bb.0:
+; THUMB78-NEXT: uxtb r1, r1
+; THUMB78-NEXT: lsls r0, r1
+; THUMB78-NEXT: ubfx r0, r0, #7, #1
+; THUMB78-NEXT: bx lr
%t0 = lshr i8 128, %y
%t1 = and i8 %t0, %x
%res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate
@@ -1051,3 +1066,5 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
%res = icmp eq i8 %t1, 1 ; should be comparing with 0
ret i1 %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; THUMB: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index a8421ae9a6a89f..31ad6ac1a03c2e 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -874,21 +874,40 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;------------------------------------------------------------------------------;
define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; ARM-LABEL: scalar_i8_signbit_ne:
-; ARM: @ %bb.0:
-; ARM-NEXT: uxtb r1, r1
-; ARM-NEXT: uxtb r0, r0
-; ARM-NEXT: lsr r0, r0, r1
-; ARM-NEXT: lsr r0, r0, #7
-; ARM-NEXT: bx lr
+; ARM6-LABEL: scalar_i8_signbit_ne:
+; ARM6: @ %bb.0:
+; ARM6-NEXT: uxtb r1, r1
+; ARM6-NEXT: uxtb r0, r0
+; ARM6-NEXT: mov r2, #128
+; ARM6-NEXT: and r0, r2, r0, lsr r1
+; ARM6-NEXT: lsr r0, r0, #7
+; ARM6-NEXT: bx lr
+;
+; ARM78-LABEL: scalar_i8_signbit_ne:
+; ARM78: @ %bb.0:
+; ARM78-NEXT: uxtb r1, r1
+; ARM78-NEXT: uxtb r0, r0
+; ARM78-NEXT: lsr r0, r0, r1
+; ARM78-NEXT: ubfx r0, r0, #7, #1
+; ARM78-NEXT: bx lr
;
-; THUMB-LABEL: scalar_i8_signbit_ne:
-; THUMB: @ %bb.0:
-; THUMB-NEXT: uxtb r1, r1
-; THUMB-NEXT: uxtb r0, r0
-; THUMB-NEXT: lsrs r0, r1
-; THUMB-NEXT: lsrs r0, r0, #7
-; THUMB-NEXT: bx lr
+; THUMB6-LABEL: scalar_i8_signbit_ne:
+; THUMB6: @ %bb.0:
+; THUMB6-NEXT: uxtb r1, r1
+; THUMB6-NEXT: uxtb r0, r0
+; THUMB6-NEXT: lsrs r0, r1
+; THUMB6-NEXT: movs r1, #128
+; THUMB6-NEXT: ands r1, r0
+; THUMB6-NEXT: lsrs r0, r1, #7
+; THUMB6-NEXT: bx lr
+;
+; THUMB78-LABEL: scalar_i8_signbit_ne:
+; THUMB78: @ %bb.0:
+; THUMB78-NEXT: uxtb r1, r1
+; THUMB78-NEXT: uxtb r0, r0
+; THUMB78-NEXT: lsrs r0, r1
+; THUMB78-NEXT: ubfx r0, r0, #7, #1
+; THUMB78-NEXT: bx lr
%t0 = shl i8 128, %y
%t1 = and i8 %t0, %x
%res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate
@@ -1089,3 +1108,5 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
%res = icmp eq i8 %t1, 1 ; should be comparing with 0
ret i1 %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; THUMB: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
index 5dbf8dd86b8916..9533ef14065bbd 100644
--- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
@@ -38,9 +38,11 @@ define void @i24_and_or(ptr %a) {
; BE-LABEL: i24_and_or:
; BE: @ %bb.0:
; BE-NEXT: mov r1, #128
+; BE-NEXT: mov r2, #256
; BE-NEXT: strb r1, [r0, #2]
; BE-NEXT: ldrh r1, [r0]
-; BE-NEXT: orr r1, r1, #1
+; BE-NEXT: orr r1, r2, r1, lsl #8
+; BE-NEXT: lsr r1, r1, #8
; BE-NEXT: strh r1, [r0]
; BE-NEXT: mov pc, lr
%b = load i24, ptr %a, align 1
@@ -90,16 +92,20 @@ define void @i56_or(ptr %a) {
;
; BE-LABEL: i56_or:
; BE: @ %bb.0:
-; BE-NEXT: mov r1, r0
-; BE-NEXT: ldr r0, [r0]
-; BE-NEXT: ldrh r2, [r1, #4]!
-; BE-NEXT: ldrb r3, [r1, #2]
+; BE-NEXT: ldr r1, [r0]
+; BE-NEXT: lsr r3, r1, #8
+; BE-NEXT: lsl r2, r1, #24
+; BE-NEXT: lsl r3, r3, #8
+; BE-NEXT: orr r2, r3, r2, lsr #24
+; BE-NEXT: str r2, [r0]
+; BE-NEXT: ldrh r2, [r0, #4]!
+; BE-NEXT: ldrb r3, [r0, #2]
; BE-NEXT: orr r2, r3, r2, lsl #8
-; BE-NEXT: orr r0, r2, r0, lsl #24
-; BE-NEXT: orr r0, r0, #384
-; BE-NEXT: strb r0, [r1, #2]
-; BE-NEXT: lsr r0, r0, #8
-; BE-NEXT: strh r0, [r1]
+; BE-NEXT: orr r1, r2, r1, lsl #24
+; BE-NEXT: orr r1, r1, #384
+; BE-NEXT: strb r1, [r0, #2]
+; BE-NEXT: lsr r1, r1, #8
+; BE-NEXT: strh r1, [r0]
; BE-NEXT: mov pc, lr
%aa = load i56, ptr %a
%b = or i56 %aa, 384
@@ -118,11 +124,20 @@ define void @i56_and_or(ptr %a) {
;
; BE-LABEL: i56_and_or:
; BE: @ %bb.0:
-; BE-NEXT: ldrh r1, [r0, #4]!
-; BE-NEXT: mov r2, #128
-; BE-NEXT: orr r1, r1, #1
-; BE-NEXT: strb r2, [r0, #2]
-; BE-NEXT: strh r1, [r0]
+; BE-NEXT: mov r1, r0
+; BE-NEXT: mov r3, #128
+; BE-NEXT: ldrh r2, [r1, #4]!
+; BE-NEXT: strb r3, [r1, #2]
+; BE-NEXT: mov r3, #256
+; BE-NEXT: orr r2, r3, r2, lsl #8
+; BE-NEXT: lsr r2, r2, #8
+; BE-NEXT: strh r2, [r1]
+; BE-NEXT: ldr r1, [r0]
+; BE-NEXT: lsl r2, r1, #24
+; BE-NEXT: lsr r1, r1, #8
+; BE-NEXT: lsl r1, r1, #8
+; BE-NEXT: orr r1, r1, r2, lsr #24
+; BE-NEXT: str r1, [r0]
; BE-NEXT: mov pc, lr
%b = load i56, ptr %a, align 1
@@ -143,6 +158,12 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
;
; BE-LABEL: i56_insert_bit:
; BE: @ %bb.0:
+; BE-NEXT: ldr r2, [r0]
+; BE-NEXT: lsl r3, r2, #24
+; BE-NEXT: lsr r2, r2, #8
+; BE-NEXT: lsl r2, r2, #8
+; BE-NEXT: orr r2, r2, r3, lsr #24
+; BE-NEXT: str r2, [r0]
; BE-NEXT: ldrh r2, [r0, #4]!
; BE-NEXT: mov r3, #57088
; BE-NEXT: orr r3, r3, #16711680
diff --git a/llvm/test/CodeGen/ARM/pr36577.ll b/llvm/test/CodeGen/ARM/pr36577.ll
index 83cde09b603bdd..8260b7d056be7b 100644
--- a/llvm/test/CodeGen/ARM/pr36577.ll
+++ b/llvm/test/CodeGen/ARM/pr36577.ll
@@ -12,21 +12,22 @@ define dso_local arm_aapcscc ptr @pr36577() {
; CHECK-LABEL: pr36577:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: movw r0, :lower16:a
-; CHECK-NEXT: mvn r1, #7
+; CHECK-NEXT: mvn r1, #1
; CHECK-NEXT: movt r0, :upper16:a
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: mvn r0, r0, lsr #7
-; CHECK-NEXT: orr r0, r1, r0, lsl #2
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: lsl r0, r0, #2
; CHECK-NEXT: bx lr
;
; CHECK-T2-LABEL: pr36577:
; CHECK-T2: @ %bb.0: @ %entry
; CHECK-T2-NEXT: movw r0, :lower16:a
-; CHECK-T2-NEXT: mvn r1, #7
+; CHECK-T2-NEXT: mvn r1, #1
; CHECK-T2-NEXT: movt r0, :upper16:a
; CHECK-T2-NEXT: ldrh r0, [r0]
-; CHECK-T2-NEXT: mvn.w r0, r0, lsr #7
-; CHECK-T2-NEXT: orr.w r0, r1, r0, lsl #2
+; CHECK-T2-NEXT: orn r0, r1, r0, lsr #7
+; CHECK-T2-NEXT: lsls r0, r0, #2
; CHECK-T2-NEXT: bx lr
entry:
%0 = load i16, ptr @a, align 2
diff --git a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
index 859aedc7a3f019..59df68b4f85635 100644
--- a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
@@ -249,6 +249,7 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
; CHECK-T2DSP: @ %bb.0:
; CHECK-T2DSP-NEXT: muls r1, r2, r1
; CHECK-T2DSP-NEXT: lsls r0, r0, #28
+; CHECK-T2DSP-NEXT: sbfx r1, r1, #0, #4
; CHECK-T2DSP-NEXT: lsls r1, r1, #28
; CHECK-T2DSP-NEXT: qadd r0, r0, r1
; CHECK-T2DSP-NEXT: asrs r0, r0, #28
@@ -258,6 +259,7 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
; CHECK-ARM: @ %bb.0:
; CHECK-ARM-NEXT: smulbb r1, r1, r2
; CHECK-ARM-NEXT: lsl r0, r0, #28
+; CHECK-ARM-NEXT: sbfx r1, r1, #0, #4
; CHECK-ARM-NEXT: lsl r1, r1, #28
; CHECK-ARM-NEXT: qadd r0, r0, r1
; CHECK-ARM-NEXT: asr r0, r0, #28
diff --git a/llvm/test/CodeGen/ARM/sbfx.ll b/llvm/test/CodeGen/ARM/sbfx.ll
index 72e9b5b1c9c425..8c13c7d33b4f00 100644
--- a/llvm/test/CodeGen/ARM/sbfx.ll
+++ b/llvm/test/CodeGen/ARM/sbfx.ll
@@ -15,7 +15,7 @@ entry:
define i32 @f2(i32 %a) {
; CHECK-LABEL: f2:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: bfc r0, #20, #12
+; CHECK-NEXT: ubfx r0, r0, #0, #20
; CHECK-NEXT: bx lr
entry:
%tmp = shl i32 %a, 12
diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
index 58a5bf1bda71da..864f612d84e2a1 100644
--- a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
+++ b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
@@ -8,8 +8,8 @@ define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: asr r1, r0, #31
; CHECK-NEXT: add r1, r0, r1, lsr #30
-; CHECK-NEXT: bic r1, r1, #3
-; CHECK-NEXT: sub r0, r0, r1
+; CHECK-NEXT: asr r1, r1, #2
+; CHECK-NEXT: sub r0, r0, r1, lsl #2
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
index 196d9340a7ce59..db8e4a73ec80af 100644
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -9,28 +9,45 @@
@array = weak global [4 x i32] zeroinitializer
define i32 @test_lshr_and1(i32 %x) {
-; CHECK-COMMON-LABEL: test_lshr_and1:
-; CHECK-COMMON: @ %bb.0: @ %entry
-; CHECK-COMMON-NEXT: movw r1, :lower16:array
-; CHECK-COMMON-NEXT: and r0, r0, #12
-; CHECK-COMMON-NEXT: movt r1, :upper16:array
-; CHECK-COMMON-NEXT: ldr r0, [r1, r0]
-; CHECK-COMMON-NEXT: bx lr
+; CHECK-ARM-LABEL: test_lshr_and1:
+; CHECK-ARM: @ %bb.0: @ %entry
+; CHECK-ARM-NEXT: movw r1, :lower16:array
+; CHECK-ARM-NEXT: ubfx r0, r0, #2, #2
+; CHECK-ARM-NEXT: movt r1, :upper16:array
+; CHECK-ARM-NEXT: ldr r0, [r1, r0, lsl #2]
+; CHECK-ARM-NEXT: bx lr
;
; CHECK-BE-LABEL: test_lshr_and1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: movw r1, :lower16:array
-; CHECK-BE-NEXT: and r0, r0, #12
+; CHECK-BE-NEXT: ubfx r0, r0, #2, #2
; CHECK-BE-NEXT: movt r1, :upper16:array
-; CHECK-BE-NEXT: ldr r0, [r1, r0]
+; CHECK-BE-NEXT: ldr r0, [r1, r0, lsl #2]
; CHECK-BE-NEXT: bx lr
;
+; CHECK-THUMB-LABEL: test_lshr_and1:
+; CHECK-THUMB: @ %bb.0: @ %entry
+; CHECK-THUMB-NEXT: movw r1, :lower16:array
+; CHECK-THUMB-NEXT: ubfx r0, r0, #2, #2
+; CHECK-THUMB-NEXT: movt r1, :upper16:array
+; CHECK-THUMB-NEXT: ldr.w r0, [r1, r0, lsl #2]
+; CHECK-THUMB-NEXT: bx lr
+;
+; CHECK-ALIGN-LABEL: test_lshr_and1:
+; CHECK-ALIGN: @ %bb.0: @ %entry
+; CHECK-ALIGN-NEXT: movw r1, :lower16:array
+; CHECK-ALIGN-NEXT: ubfx r0, r0, #2, #2
+; CHECK-ALIGN-NEXT: movt r1, :upper16:array
+; CHECK-ALIGN-NEXT: ldr.w r0, [r1, r0, lsl #2]
+; CHECK-ALIGN-NEXT: bx lr
+;
; CHECK-V6M-LABEL: test_lshr_and1:
; CHECK-V6M: @ %bb.0: @ %entry
-; CHECK-V6M-NEXT: movs r1, #12
-; CHECK-V6M-NEXT: ands r1, r0
-; CHECK-V6M-NEXT: ldr r0, .LCPI0_0
-; CHECK-V6M-NEXT: ldr r0, [r0, r1]
+; CHECK-V6M-NEXT: lsls r0, r0, #28
+; CHECK-V6M-NEXT: lsrs r0, r0, #30
+; CHECK-V6M-NEXT: lsls r0, r0, #2
+; CHECK-V6M-NEXT: ldr r1, .LCPI0_0
+; CHECK-V6M-NEXT: ldr r0, [r1, r0]
; CHECK-V6M-NEXT: bx lr
; CHECK-V6M-NEXT: .p2align 2
; CHECK-V6M-NEXT: @ %bb.1:
diff --git a/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll b/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
index 838da59f9e412c..7ee000e5d8af10 100644
--- a/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
+++ b/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
@@ -170,25 +170,27 @@ define i1 @test_33_1_31(ptr %y) {
; CHECK-LE-LABEL: test_33_1_31:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldrb r0, [r0, #3]
+; CHECK-LE-NEXT: and r0, r0, #128
; CHECK-LE-NEXT: lsr r0, r0, #7
; CHECK-LE-NEXT: mov pc, lr
;
; CHECK-V7-LE-LABEL: test_33_1_31:
; CHECK-V7-LE: @ %bb.0:
; CHECK-V7-LE-NEXT: ldrb r0, [r0, #3]
-; CHECK-V7-LE-NEXT: lsr r0, r0, #7
+; CHECK-V7-LE-NEXT: ubfx r0, r0, #7, #1
; CHECK-V7-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: test_33_1_31:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0, #1]
+; CHECK-BE-NEXT: and r0, r0, #128
; CHECK-BE-NEXT: lsr r0, r0, #7
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_1_31:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #1]
-; CHECK-V7-BE-NEXT: lsr r0, r0, #7
+; CHECK-V7-BE-NEXT: ubfx r0, r0, #7, #1
; CHECK-V7-BE-NEXT: bx lr
%a = load i33, ptr %y
%b = and i33 %a, u0x80000000
diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
index a4e081d5384e5e..4e95c37c0f324d 100644
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -115,11 +115,12 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM5-LABEL: test_srem_even:
; ARM5: @ %bb.0:
; ARM5-NEXT: lsl r1, r0, #28
-; ARM5-NEXT: mov r2, #1
+; ARM5-NEXT: mov r2, #8
; ARM5-NEXT: asr r1, r1, #28
; ARM5-NEXT: add r1, r1, r1, lsl #1
-; ARM5-NEXT: and r2, r2, r1, lsr #7
-; ARM5-NEXT: add r1, r2, r1, lsr #4
+; ARM5-NEXT: and r2, r2, r1, lsr #4
+; ARM5-NEXT: lsr r1, r1, #4
+; ARM5-NEXT: add r1, r1, r2, lsr #3
; ARM5-NEXT: add r1, r1, r1, lsl #1
; ARM5-NEXT: sub r0, r0, r1, lsl #1
; ARM5-NEXT: and r0, r0, #15
@@ -131,11 +132,12 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM6-LABEL: test_srem_even:
; ARM6: @ %bb.0:
; ARM6-NEXT: lsl r1, r0, #28
-; ARM6-NEXT: mov r2, #1
+; ARM6-NEXT: mov r2, #8
; ARM6-NEXT: asr r1, r1, #28
; ARM6-NEXT: add r1, r1, r1, lsl #1
-; ARM6-NEXT: and r2, r2, r1, lsr #7
-; ARM6-NEXT: add r1, r2, r1, lsr #4
+; ARM6-NEXT: and r2, r2, r1, lsr #4
+; ARM6-NEXT: lsr r1, r1, #4
+; ARM6-NEXT: add r1, r1, r2, lsr #3
; ARM6-NEXT: add r1, r1, r1, lsl #1
; ARM6-NEXT: sub r0, r0, r1, lsl #1
; ARM6-NEXT: and r0, r0, #15
@@ -147,9 +149,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM7-LABEL: test_srem_even:
; ARM7: @ %bb.0:
; ARM7-NEXT: sbfx r1, r0, #0, #4
+; ARM7-NEXT: mov r2, #8
; ARM7-NEXT: add r1, r1, r1, lsl #1
-; ARM7-NEXT: ubfx r2, r1, #7, #1
-; ARM7-NEXT: add r1, r2, r1, lsr #4
+; ARM7-NEXT: and r2, r2, r1, lsr #4
+; ARM7-NEXT: lsr r1, r1, #4
+; ARM7-NEXT: add r1, r1, r2, lsr #3
; ARM7-NEXT: add r1, r1, r1, lsl #1
; ARM7-NEXT: sub r0, r0, r1, lsl #1
; ARM7-NEXT: and r0, r0, #15
@@ -161,9 +165,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM8-LABEL: test_srem_even:
; ARM8: @ %bb.0:
; ARM8-NEXT: sbfx r1, r0, #0, #4
+; ARM8-NEXT: mov r2, #8
; ARM8-NEXT: add r1, r1, r1, lsl #1
-; ARM8-NEXT: ubfx r2, r1, #7, #1
-; ARM8-NEXT: add r1, r2, r1, lsr #4
+; ARM8-NEXT: and r2, r2, r1, lsr #4
+; ARM8-NEXT: lsr r1, r1, #4
+; ARM8-NEXT: add r1, r1, r2, lsr #3
; ARM8-NEXT: add r1, r1, r1, lsl #1
; ARM8-NEXT: sub r0, r0, r1, lsl #1
; ARM8-NEXT: and r0, r0, #15
@@ -175,9 +181,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; NEON7-LABEL: test_srem_even:
; NEON7: @ %bb.0:
; NEON7-NEXT: sbfx r1, r0, #0, #4
+; NEON7-NEXT: mov r2, #8
; NEON7-NEXT: add r1, r1, r1, lsl #1
-; NEON7-NEXT: ubfx r2, r1, #7, #1
-; NEON7-NEXT: add r1, r2, r1, lsr #4
+; NEON7-NEXT: and r2, r2, r1, lsr #4
+; NEON7-NEXT: lsr r1, r1, #4
+; NEON7-NEXT: add r1, r1, r2, lsr #3
; NEON7-NEXT: add r1, r1, r1, lsl #1
; NEON7-NEXT: sub r0, r0, r1, lsl #1
; NEON7-NEXT: and r0, r0, #15
@@ -189,9 +197,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; NEON8-LABEL: test_srem_even:
; NEON8: @ %bb.0:
; NEON8-NEXT: sbfx r1, r0, #0, #4
+; NEON8-NEXT: mov r2, #8
; NEON8-NEXT: add r1, r1, r1, lsl #1
-; NEON8-NEXT: ubfx r2, r1, #7, #1
-; NEON8-NEXT: add r1, r2, r1, lsr #4
+; NEON8-NEXT: and r2, r2, r1, lsr #4
+; NEON8-NEXT: lsr r1, r1, #4
+; NEON8-NEXT: add r1, r1, r2, lsr #3
; NEON8-NEXT: add r1, r1, r1, lsl #1
; NEON8-NEXT: sub r0, r0, r1, lsl #1
; NEON8-NEXT: and r0, r0, #15
@@ -208,12 +218,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; ARM5-LABEL: test_srem_pow2_setne:
; ARM5: @ %bb.0:
; ARM5-NEXT: lsl r1, r0, #26
-; ARM5-NEXT: mov r2, #3
+; ARM5-NEXT: mov r2, #48
; ARM5-NEXT: asr r1, r1, #26
-; ARM5-NEXT: and r1, r2, r1, lsr #9
-; ARM5-NEXT: add r1, r0, r1
-; ARM5-NEXT: and r1, r1, #60
-; ARM5-NEXT: sub r0, r0, r1
+; ARM5-NEXT: and r1, r2, r1, lsr #5
+; ARM5-NEXT: add r1, r0, r1, lsr #4
+; ARM5-NEXT: lsr r1, r1, #2
+; ARM5-NEXT: sub r0, r0, r1, lsl #2
; ARM5-NEXT: ands r0, r0, #63
; ARM5-NEXT: movne r0, #1
; ARM5-NEXT: bx lr
@@ -221,12 +231,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; ARM6-LABEL: test_srem_pow2_setne:
; ARM6: @ %bb.0:
; ARM6-NEXT: lsl r1, r0, #26
-; ARM6-NEXT: mov r2, #3
+; ARM6-NEXT: mov r2, #48
; ARM6-NEXT: asr r1, r1, #26
-; ARM6-NEXT: and r1, r2, r1, lsr #9
-; ARM6-NEXT: add r1, r0, r1
-; ARM6-NEXT: and r1, r1, #60
-; ARM6-NEXT: sub r0, r0, r1
+; ARM6-NEXT: and r1, r2, r1, lsr #5
+; ARM6-NEXT: add r1, r0, r1, lsr #4
+; ARM6-NEXT: lsr r1, r1, #2
+; ARM6-NEXT: sub r0, r0, r1, lsl #2
; ARM6-NEXT: ands r0, r0, #63
; ARM6-NEXT: movne r0, #1
; ARM6-NEXT: bx lr
@@ -234,10 +244,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; ARM7-LABEL: test_srem_pow2_setne:
; ARM7: @ %bb.0:
; ARM7-NEXT: sbfx r1, r0, #0, #6
-; ARM7-NEXT: ubfx r1, r1, #9, #2
-; ARM7-NEXT: add r1, r0, r1
-; ARM7-NEXT: and r1, r1, #60
-; ARM7-NEXT: sub r0, r0, r1
+; ARM7-NEXT: mov r2, #48
+; ARM7-NEXT: and r1, r2, r1, lsr #5
+; ARM7-NEXT: add r1, r0, r1, lsr #4
+; ARM7-NEXT: lsr r1, r1, #2
+; ARM7-NEXT: sub r0, r0, r1, lsl #2
; ARM7-NEXT: ands r0, r0, #63
; ARM7-NEXT: movwne r0, #1
; ARM7-NEXT: bx lr
@@ -245,10 +256,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; ARM8-LABEL: test_srem_pow2_setne:
; ARM8: @ %bb.0:
; ARM8-NEXT: sbfx r1, r0, #0, #6
-; ARM8-NEXT: ubfx r1, r1, #9, #2
-; ARM8-NEXT: add r1, r0, r1
-; ARM8-NEXT: and r1, r1, #60
-; ARM8-NEXT: sub r0, r0, r1
+; ARM8-NEXT: mov r2, #48
+; ARM8-NEXT: and r1, r2, r1, lsr #5
+; ARM8-NEXT: add r1, r0, r1, lsr #4
+; ARM8-NEXT: lsr r1, r1, #2
+; ARM8-NEXT: sub r0, r0, r1, lsl #2
; ARM8-NEXT: ands r0, r0, #63
; ARM8-NEXT: movwne r0, #1
; ARM8-NEXT: bx lr
@@ -256,10 +268,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; NEON7-LABEL: test_srem_pow2_setne:
; NEON7: @ %bb.0:
; NEON7-NEXT: sbfx r1, r0, #0, #6
-; NEON7-NEXT: ubfx r1, r1, #9, #2
-; NEON7-NEXT: add r1, r0, r1
-; NEON7-NEXT: and r1, r1, #60
-; NEON7-NEXT: sub r0, r0, r1
+; NEON7-NEXT: mov r2, #48
+; NEON7-NEXT: and r1, r2, r1, lsr #5
+; NEON7-NEXT: add r1, r0, r1, lsr #4
+; NEON7-NEXT: lsr r1, r1, #2
+; NEON7-NEXT: sub r0, r0, r1, lsl #2
; NEON7-NEXT: ands r0, r0, #63
; NEON7-NEXT: movwne r0, #1
; NEON7-NEXT: bx lr
@@ -267,10 +280,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; NEON8-LABEL: test_srem_pow2_setne:
; NEON8: @ %bb.0:
; NEON8-NEXT: sbfx r1, r0, #0, #6
-; NEON8-NEXT: ubfx r1, r1, #9, #2
-; NEON8-NEXT: add r1, r0, r1
-; NEON8-NEXT: and r1, r1, #60
-; NEON8-NEXT: sub r0, r0, r1
+; NEON8-NEXT: mov r2, #48
+; NEON8-NEXT: and r1, r2, r1, lsr #5
+; NEON8-NEXT: add r1, r0, r1, lsr #4
+; NEON8-NEXT: lsr r1, r1, #2
+; NEON8-NEXT: sub r0, r0, r1, lsl #2
; NEON8-NEXT: ands r0, r0, #63
; NEON8-NEXT: movwne r0, #1
; NEON8-NEXT: bx lr
diff --git a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
index 0a2d1f0e7a240e..0c1af4e35ab23e 100644
--- a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
@@ -251,6 +251,7 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
; CHECK-T2DSP: @ %bb.0:
; CHECK-T2DSP-NEXT: muls r1, r2, r1
; CHECK-T2DSP-NEXT: lsls r0, r0, #28
+; CHECK-T2DSP-NEXT: sbfx r1, r1, #0, #4
; CHECK-T2DSP-NEXT: lsls r1, r1, #28
; CHECK-T2DSP-NEXT: qsub r0, r0, r1
; CHECK-T2DSP-NEXT: asrs r0, r0, #28
@@ -260,6 +261,7 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
; CHECK-ARM: @ %bb.0:
; CHECK-ARM-NEXT: smulbb r1, r1, r2
; CHECK-ARM-NEXT: lsl r0, r0, #28
+; CHECK-ARM-NEXT: sbfx r1, r1, #0, #4
; CHECK-ARM-NEXT: lsl r1, r1, #28
; CHECK-ARM-NEXT: qsub r0, r0, r1
; CHECK-ARM-NEXT: asr r0, r0, #28
diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
index d51c9554a022c2..e596862d231138 100644
--- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
@@ -6,13 +6,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = and(#24,asl(r0,#3))
-; CHECK-NEXT: r2 = and(r0,#-4)
+; CHECK-NEXT: r2 = and(r0,#3)
; CHECK-NEXT: r3 = #255
+; CHECK-NEXT: r1 = and(r1,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r4 = asl(r3,r0)
-; CHECK-NEXT: r3 = and(r1,#255)
+; CHECK-NEXT: r2 = asl(r2,#3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = asl(r3,r2)
+; CHECK-NEXT: r3 = and(r0,#-4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = sub(#-1,r4)
@@ -21,35 +24,34 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: .LBB0_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: {
-; CHECK-NEXT: r5 = memw_locked(r2)
+; CHECK-NEXT: r5 = memw_locked(r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = lsr(r5,r0)
+; CHECK-NEXT: r0 = lsr(r5,r2)
; CHECK-NEXT: r5 = and(r5,r4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = and(r1,#255)
+; CHECK-NEXT: r6 = and(r0,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = cmp.gtu(r3,r6)
-; CHECK-NEXT: if (p0.new) r6 = add(r1,#1)
+; CHECK-NEXT: p0 = cmp.gtu(r1,r6)
+; CHECK-NEXT: if (p0.new) r6 = add(r0,#1)
; CHECK-NEXT: if (!p0.new) r6 = #0
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = and(r6,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r5 |= asl(r6,r0)
+; CHECK-NEXT: r5 |= asl(r6,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r2,p0) = r5
+; CHECK-NEXT: memw_locked(r3,p0) = r5
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB0_1
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = r1
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
@@ -61,13 +63,16 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = and(#24,asl(r0,#3))
-; CHECK-NEXT: r2 = and(r0,#-4)
+; CHECK-NEXT: r2 = and(r0,#3)
; CHECK-NEXT: r3 = ##65535
+; CHECK-NEXT: r1 = zxth(r1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r4 = asl(r3,r0)
-; CHECK-NEXT: r3 = zxth(r1)
+; CHECK-NEXT: r2 = asl(r2,#3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = asl(r3,r2)
+; CHECK-NEXT: r3 = and(r0,#-4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = sub(#-1,r4)
@@ -76,35 +81,34 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: .LBB1_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: {
-; CHECK-NEXT: r5 = memw_locked(r2)
+; CHECK-NEXT: r5 = memw_locked(r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = lsr(r5,r0)
+; CHECK-NEXT: r0 = lsr(r5,r2)
; CHECK-NEXT: r5 = and(r5,r4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = zxth(r1)
+; CHECK-NEXT: r6 = zxth(r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = cmp.gtu(r3,r6)
-; CHECK-NEXT: if (p0.new) r6 = add(r1,#1)
+; CHECK-NEXT: p0 = cmp.gtu(r1,r6)
+; CHECK-NEXT: if (p0.new) r6 = add(r0,#1)
; CHECK-NEXT: if (!p0.new) r6 = #0
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = zxth(r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r5 |= asl(r6,r0)
+; CHECK-NEXT: r5 |= asl(r6,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r2,p0) = r5
+; CHECK-NEXT: memw_locked(r3,p0) = r5
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB1_1
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = r1
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
@@ -183,35 +187,38 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = and(#24,asl(r0,#3))
-; CHECK-NEXT: r3 = and(r0,#-4)
-; CHECK-NEXT: r4 = #255
+; CHECK-NEXT: r3 = and(r0,#3)
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: r4 = and(r0,#-4)
; CHECK-NEXT: r5 = and(r1,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = asl(r4,r0)
+; CHECK-NEXT: r3 = asl(r3,#3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = sub(#-1,r2)
+; CHECK-NEXT: r6 = asl(r2,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = sub(#-1,r6)
; CHECK-NEXT: }
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = memw_locked(r3)
+; CHECK-NEXT: r7 = memw_locked(r4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = lsr(r7,r0)
+; CHECK-NEXT: r0 = lsr(r7,r3)
; CHECK-NEXT: r7 = and(r7,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = bitsclr(r2,r4)
-; CHECK-NEXT: r8 = and(r2,#255)
+; CHECK-NEXT: p0 = bitsclr(r0,r2)
+; CHECK-NEXT: r8 = and(r0,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p1 = cmp.gtu(r8,r5)
; CHECK-NEXT: if (p1.new) r8 = add(r1,#0)
-; CHECK-NEXT: if (!p1.new) r8 = add(r2,#-1)
+; CHECK-NEXT: if (!p1.new) r8 = add(r0,#-1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (p0) r8 = add(r1,#0)
@@ -220,17 +227,16 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: r8 = and(r8,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 |= asl(r8,r0)
+; CHECK-NEXT: r7 |= asl(r8,r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r3,p0) = r7
+; CHECK-NEXT: memw_locked(r4,p0) = r7
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB4_1
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = r2
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
@@ -242,35 +248,38 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = and(#24,asl(r0,#3))
-; CHECK-NEXT: r3 = and(r0,#-4)
-; CHECK-NEXT: r4 = ##65535
+; CHECK-NEXT: r3 = and(r0,#3)
+; CHECK-NEXT: r2 = ##65535
+; CHECK-NEXT: r4 = and(r0,#-4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = asl(r4,r0)
+; CHECK-NEXT: r3 = asl(r3,#3)
; CHECK-NEXT: r5 = zxth(r1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = sub(#-1,r2)
+; CHECK-NEXT: r6 = asl(r2,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = sub(#-1,r6)
; CHECK-NEXT: }
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = memw_locked(r3)
+; CHECK-NEXT: r7 = memw_locked(r4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = lsr(r7,r0)
+; CHECK-NEXT: r0 = lsr(r7,r3)
; CHECK-NEXT: r7 = and(r7,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = bitsclr(r2,r4)
-; CHECK-NEXT: r8 = zxth(r2)
+; CHECK-NEXT: p0 = bitsclr(r0,r2)
+; CHECK-NEXT: r8 = zxth(r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p1 = cmp.gtu(r8,r5)
; CHECK-NEXT: if (p1.new) r8 = add(r1,#0)
-; CHECK-NEXT: if (!p1.new) r8 = add(r2,#-1)
+; CHECK-NEXT: if (!p1.new) r8 = add(r0,#-1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (p0) r8 = add(r1,#0)
@@ -279,17 +288,16 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: r8 = zxth(r8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 |= asl(r8,r0)
+; CHECK-NEXT: r7 |= asl(r8,r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r3,p0) = r7
+; CHECK-NEXT: memw_locked(r4,p0) = r7
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB5_1
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = r2
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
diff --git a/llvm/test/CodeGen/Hexagon/isel-global-offset-alignment.ll b/llvm/test/CodeGen/Hexagon/isel-global-offset-alignment.ll
index af479fde7ce35a..b91c9834b9e140 100644
--- a/llvm/test/CodeGen/Hexagon/isel-global-offset-alignment.ll
+++ b/llvm/test/CodeGen/Hexagon/isel-global-offset-alignment.ll
@@ -14,13 +14,13 @@ define void @fred(i1 %x) #0 {
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
; CHECK-NEXT: p0 = tstbit(r0,#0)
-; CHECK-NEXT: if (!p0.new) r2 = #1024
+; CHECK-NEXT: if (!p0.new) r2 = #512
; CHECK-NEXT: if (p0.new) r2 = #0
; CHECK-NEXT: r5:4 = combine(#0,#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memd(r2+##array+184) = r5:4
-; CHECK-NEXT: memd(r2+##array+176) = r5:4
+; CHECK-NEXT: memd(r2<<#1+##array+184) = r5:4
+; CHECK-NEXT: memd(r2<<#1+##array+176) = r5:4
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
diff --git a/llvm/test/CodeGen/Hexagon/vect/vect-shifts.ll b/llvm/test/CodeGen/Hexagon/vect/vect-shifts.ll
index 46f73c5e0e81b8..b6d4a2ed3aad72 100644
--- a/llvm/test/CodeGen/Hexagon/vect/vect-shifts.ll
+++ b/llvm/test/CodeGen/Hexagon/vect/vect-shifts.ll
@@ -246,21 +246,21 @@ define <4 x i8> @f15(<4 x i8> %a0) unnamed_addr #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = extract(r0,#8,#24)
-; CHECK-NEXT: r2 = asl(r2,#6)
+; CHECK-NEXT: r3 = extractu(r3,#8,#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r3 = extractu(r3,#8,#1)
-; CHECK-NEXT: r0 = asl(r4,#4)
+; CHECK-NEXT: r2 = extractu(r2,#8,#2)
+; CHECK-NEXT: r0 = extractu(r4,#8,#4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r1 = extractu(r1,#8,#3)
-; CHECK-NEXT: r2 = or(r3,and(r2,##65280))
+; CHECK-NEXT: r3 |= asl(r2,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = or(r1,and(r0,##65280))
+; CHECK-NEXT: r1 |= asl(r0,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = combine(r0.l,r2.l)
+; CHECK-NEXT: r0 = combine(r1.l,r3.l)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
b0:
@@ -272,21 +272,22 @@ define <4 x i8> @f16(<4 x i8> %a0) unnamed_addr #0 {
; CHECK-LABEL: f16:
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = extractu(r0,#8,#8)
-; CHECK-NEXT: r2 = extractu(r0,#8,#24)
+; CHECK-NEXT: r1 = extractu(r0,#8,#24)
+; CHECK-NEXT: r2 = extractu(r0,#5,#19)
; CHECK-NEXT: }
; CHECK-NEXT: {
+; CHECK-NEXT: r1 = extractu(r1,#4,#4)
; CHECK-NEXT: r3 = extractu(r0,#7,#1)
-; CHECK-NEXT: r4 = extractu(r0,#5,#19)
-; CHECK-NEXT: r1 = and(r1,#252)
-; CHECK-NEXT: r2 = and(r2,#240)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r3 |= asl(r1,#6)
-; CHECK-NEXT: r4 |= asl(r2,#4)
+; CHECK-NEXT: r4 = extractu(r0,#6,#10)
+; CHECK-NEXT: r2 |= asl(r1,#8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 |= asl(r4,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = combine(r4.l,r3.l)
+; CHECK-NEXT: r0 = combine(r2.l,r3.l)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
b0:
@@ -303,14 +304,18 @@ define <4 x i8> @f17(<4 x i8> %a0) unnamed_addr #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = extractu(r0,#8,#24)
-; CHECK-NEXT: r1 = and(#248,asl(r1,#3))
+; CHECK-NEXT: r2 = and(#252,asl(r2,#2))
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = and(#254,asl(r0,#1))
-; CHECK-NEXT: r1 = insert(r3,#4,#12)
+; CHECK-NEXT: r3 = and(#240,asl(r3,#4))
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = and(#248,asl(r1,#3))
+; CHECK-NEXT: r0 |= asl(r2,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = insert(r2,#6,#10)
+; CHECK-NEXT: r1 |= asl(r3,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = combine(r1.l,r0.l)
@@ -326,53 +331,55 @@ define <8 x i8> @f18(<8 x i8> %a0) unnamed_addr #0 {
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = extractu(r1:0,#8,#48)
-; CHECK-NEXT: r5:4 = extractu(r1:0,#8,#24)
+; CHECK-NEXT: r5:4 = extractu(r1:0,#8,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7:6 = extractu(r1:0,#8,#16)
-; CHECK-NEXT: r5 = extract(r0,#8,#8)
+; CHECK-NEXT: r7:6 = extractu(r1:0,#8,#24)
+; CHECK-NEXT: r9:8 = extractu(r1:0,#8,#16)
; CHECK-NEXT: r3 = sxtb(r0)
-; CHECK-NEXT: r2 = sxtb(r2)
+; CHECK-NEXT: r4 = sxtb(r4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = extract(r1,#8,#8)
-; CHECK-NEXT: r9:8 = extractu(r1:0,#8,#32)
+; CHECK-NEXT: r13:12 = extractu(r1:0,#8,#40)
+; CHECK-NEXT: r15:14 = extractu(r1:0,#8,#32)
+; CHECK-NEXT: r5 = sxtb(r6)
+; CHECK-NEXT: r2 = sxtb(r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r1:0 = extractu(r1:0,#8,#56)
; CHECK-NEXT: r3 = extractu(r3,#8,#1)
+; CHECK-NEXT: r6 = sxtb(r12)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r5 = asl(r5,#6)
-; CHECK-NEXT: r6 = sxtb(r8)
-; CHECK-NEXT: r1 = sxtb(r4)
-; CHECK-NEXT: r4 = sxtb(r6)
-; CHECK-NEXT: }
-; CHECK-NEXT: {
-; CHECK-NEXT: r5 = or(r3,and(r5,##65280))
-; CHECK-NEXT: r3 = asl(r7,#5)
+; CHECK-NEXT: r4 = extractu(r4,#8,#2)
+; CHECK-NEXT: r6 = extractu(r6,#8,#3)
+; CHECK-NEXT: r1 = sxtb(r8)
; CHECK-NEXT: r0 = sxtb(r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = extractu(r6,#8,#2)
-; CHECK-NEXT: r1 = asl(r1,#4)
+; CHECK-NEXT: r3 |= asl(r4,#8)
+; CHECK-NEXT: r4 = extractu(r5,#8,#4)
+; CHECK-NEXT: r5 = sxtb(r14)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r4 = extractu(r4,#8,#3)
-; CHECK-NEXT: r7 = asl(r0,#7)
+; CHECK-NEXT: r1 = extractu(r1,#8,#3)
+; CHECK-NEXT: r5 = extractu(r5,#8,#2)
; CHECK-NEXT: }
; CHECK-NEXT: {
+; CHECK-NEXT: r7 = extractu(r0,#8,#1)
; CHECK-NEXT: r2 = extractu(r2,#8,#4)
-; CHECK-NEXT: r1 = or(r4,and(r1,##65280))
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r3 = or(r6,and(r3,##65280))
-; CHECK-NEXT: r7 = or(r2,and(r7,##65280))
+; CHECK-NEXT: r1 |= asl(r4,#8)
+; CHECK-NEXT: r5 |= asl(r6,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = combine(r1.l,r5.l)
+; CHECK-NEXT: r2 |= asl(r7,#8)
+; CHECK-NEXT: r0 = combine(r1.l,r3.l)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = combine(r2.l,r5.l)
; CHECK-NEXT: jumpr r31
-; CHECK-NEXT: r1 = combine(r7.l,r3.l)
; CHECK-NEXT: }
b0:
%v0 = ashr <8 x i8> %a0, <i8 1, i8 2, i8 3, i8 4, i8 2, i8 3, i8 4, i8 1>
@@ -383,36 +390,36 @@ define <8 x i8> @f19(<8 x i8> %a0) unnamed_addr #0 {
; CHECK-LABEL: f19:
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
-; CHECK-NEXT: r3:2 = extractu(r1:0,#8,#8)
-; CHECK-NEXT: r5:4 = extractu(r1:0,#8,#24)
+; CHECK-NEXT: r3:2 = extractu(r1:0,#8,#24)
+; CHECK-NEXT: r5:4 = extractu(r1:0,#8,#56)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = extractu(r1,#4,#20)
-; CHECK-NEXT: r5 = extractu(r0,#7,#1)
-; CHECK-NEXT: r2 = and(r2,#252)
-; CHECK-NEXT: r4 = and(r4,#240)
+; CHECK-NEXT: r4 = extractu(r4,#7,#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = extractu(r0,#5,#19)
-; CHECK-NEXT: r9:8 = extractu(r1:0,#8,#40)
+; CHECK-NEXT: r7 = extractu(r1,#5,#11)
+; CHECK-NEXT: r1 = extractu(r1,#6,#2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = extractu(r1,#6,#2)
-; CHECK-NEXT: r1:0 = extractu(r1:0,#8,#56)
+; CHECK-NEXT: r2 = extractu(r2,#4,#4)
+; CHECK-NEXT: r5 = extractu(r0,#5,#19)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r5 |= asl(r2,#6)
-; CHECK-NEXT: r6 |= asl(r4,#4)
-; CHECK-NEXT: r1 = and(r8,#248)
-; CHECK-NEXT: r11 = and(r0,#254)
+; CHECK-NEXT: r6 = extractu(r0,#7,#1)
+; CHECK-NEXT: r0 = extractu(r0,#6,#10)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 |= asl(r1,#5)
-; CHECK-NEXT: r3 |= asl(r11,#7)
-; CHECK-NEXT: r0 = combine(r6.l,r5.l)
+; CHECK-NEXT: r6 |= asl(r0,#8)
+; CHECK-NEXT: r5 |= asl(r2,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = combine(r3.l,r7.l)
+; CHECK-NEXT: r1 |= asl(r7,#8)
+; CHECK-NEXT: r3 |= asl(r4,#8)
+; CHECK-NEXT: r0 = combine(r5.l,r6.l)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = combine(r3.l,r1.l)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
b0:
@@ -424,38 +431,48 @@ define <8 x i8> @f20(<8 x i8> %a0) unnamed_addr #0 {
; CHECK-LABEL: f20:
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
-; CHECK-NEXT: r5:4 = extractu(r1:0,#8,#16)
-; CHECK-NEXT: r3:2 = extractu(r1:0,#8,#48)
+; CHECK-NEXT: r3:2 = extractu(r1:0,#8,#24)
+; CHECK-NEXT: r5:4 = extractu(r1:0,#8,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r4 = and(#248,asl(r4,#3))
-; CHECK-NEXT: r15:14 = extractu(r1:0,#8,#32)
-; CHECK-NEXT: r5 = r0
+; CHECK-NEXT: r2 = and(#240,asl(r2,#4))
+; CHECK-NEXT: r9:8 = extractu(r1:0,#8,#16)
+; CHECK-NEXT: r3 = r0
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r13:12 = extractu(r1:0,#8,#24)
-; CHECK-NEXT: r7:6 = extractu(r1:0,#8,#56)
-; CHECK-NEXT: r3 = r4
+; CHECK-NEXT: r7:6 = extractu(r1:0,#8,#48)
+; CHECK-NEXT: r3 = and(#254,asl(r3,#1))
+; CHECK-NEXT: r5 = r2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = and(#240,asl(r2,#4))
-; CHECK-NEXT: r9:8 = extractu(r1:0,#8,#8)
+; CHECK-NEXT: r5 = insert(r8,#5,#3)
+; CHECK-NEXT: r4 = and(#252,asl(r4,#2))
+; CHECK-NEXT: r7 = r3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r13:12 = extractu(r1:0,#8,#40)
+; CHECK-NEXT: r11:10 = extractu(r1:0,#8,#32)
+; CHECK-NEXT: r9 = r5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = extractu(r1:0,#8,#56)
+; CHECK-NEXT: r7 |= asl(r4,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r5 = and(#254,asl(r5,#1))
-; CHECK-NEXT: r4 = insert(r14,#6,#2)
+; CHECK-NEXT: r9 |= asl(r2,#8)
+; CHECK-NEXT: r3 = insert(r0,#7,#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1:0 = extractu(r1:0,#8,#40)
-; CHECK-NEXT: r3 = insert(r12,#4,#12)
+; CHECK-NEXT: r2 = insert(r6,#4,#4)
+; CHECK-NEXT: r5 = insert(r12,#5,#3)
+; CHECK-NEXT: r0 = combine(r9.l,r7.l)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r5 = insert(r8,#6,#10)
-; CHECK-NEXT: r4 = insert(r0,#5,#11)
+; CHECK-NEXT: r4 = insert(r10,#6,#2)
+; CHECK-NEXT: r2 |= asl(r3,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = insert(r6,#7,#9)
-; CHECK-NEXT: r0 = combine(r3.l,r5.l)
+; CHECK-NEXT: r4 |= asl(r5,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r1 = combine(r2.l,r4.l)
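
Note on the Hexagon hunks above: f16-f20 lower per-lane i8 shifts of <4 x i8>/<8 x i8> values packed into scalar registers, and the and/asl masking sequences are replaced by extractu/insert forms. A minimal scalar sketch of the lane identity involved (the function name is illustrative, not from the patch):

define i32 @lane_lshr1(i32 %x) {
  ; shifting right by 1 and masking to 7 bits is the same computation
  ; as Hexagon's extractu(r0,#7,#1): bits 7..1 of %x.
  %s = lshr i32 %x, 1
  %m = and i32 %s, 127
  ret i32 %m
}
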
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
index b95c2e24737a50..bd38081f84c279 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
@@ -4,12 +4,12 @@
define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; LA64-LABEL: atomicrmw_uinc_wrap_i8:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a2, $a3, 24
-; LA64-NEXT: ori $a5, $zero, 255
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: ld.w $a4, $a0, 0
-; LA64-NEXT: sll.w $a3, $a5, $a3
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: nor $a3, $a3, $zero
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: .p2align 4, , 16
@@ -54,13 +54,13 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; LA64-LABEL: atomicrmw_uinc_wrap_i16:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a2, $a3, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a5, $a4, 4095
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: ld.w $a4, $a0, 0
-; LA64-NEXT: sll.w $a3, $a5, $a3
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: nor $a3, $a3, $zero
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: .p2align 4, , 16
@@ -180,12 +180,12 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; LA64-LABEL: atomicrmw_udec_wrap_i8:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a2, $a3, 24
-; LA64-NEXT: ori $a4, $zero, 255
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: ld.w $a5, $a0, 0
-; LA64-NEXT: sll.w $a3, $a4, $a3
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: nor $a3, $a3, $zero
; LA64-NEXT: andi $a4, $a1, 255
; LA64-NEXT: .p2align 4, , 16
@@ -235,13 +235,13 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; LA64-LABEL: atomicrmw_udec_wrap_i16:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a2, $a3, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: ld.w $a5, $a0, 0
-; LA64-NEXT: sll.w $a3, $a4, $a3
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: nor $a3, $a3, $zero
; LA64-NEXT: bstrpick.d $a4, $a1, 15, 0
; LA64-NEXT: .p2align 4, , 16
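
The LA64 changes in this file (and in the cmpxchg/atomicrmw files below) all stem from one reassociation of the byte-offset computation: masking the pointer before the shift instead of after it. Both forms produce the same shift amount; a sketch in IR, with an illustrative function name:

define i64 @shamt(i64 %p) {
  ; new form:  andi $a2, $a0, 3  ;  slli.d $a2, $a2, 3
  %bits = and i64 %p, 3
  %amt  = shl nuw nsw i64 %bits, 3
  ; the old form computed (shl i64 %p, 3) then (and ..., 24); same value,
  ; since only bits 1:0 of %p survive either way.
  ret i64 %amt
}
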
diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
index e008caacad2a17..af871ed95793a2 100644
--- a/llvm/test/CodeGen/LoongArch/bstrins_w.ll
+++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
@@ -94,6 +94,7 @@ define i32 @pat3_swap(i32 %a, i32 %b) nounwind {
define i32 @pat3_positive_mask0(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: pat3_positive_mask0:
; CHECK: # %bb.0:
+; CHECK-NEXT: bstrins.w $a1, $zero, 27, 0
; CHECK-NEXT: srli.w $a1, $a1, 28
; CHECK-NEXT: bstrins.w $a0, $a1, 31, 28
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/bstrpick_d.ll b/llvm/test/CodeGen/LoongArch/bstrpick_d.ll
index e93c1391d463f0..5851bbe92d4512 100644
--- a/llvm/test/CodeGen/LoongArch/bstrpick_d.ll
+++ b/llvm/test/CodeGen/LoongArch/bstrpick_d.ll
@@ -53,7 +53,8 @@ define i64 @and4095(i64 %a) {
define i64 @and0xff0_lshr4(i64 %a) {
; CHECK-LABEL: and0xff0_lshr4:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a0, $a0, 11, 4
+; CHECK-NEXT: andi $a0, $a0, 4080
+; CHECK-NEXT: srli.d $a0, $a0, 4
; CHECK-NEXT: ret
%and = and i64 %a, 4080
%shr = lshr i64 %and, 4
@@ -66,7 +67,8 @@ define i64 @and0xff0_lshr4(i64 %a) {
define i64 @and4080_ashr5(i64 %a) {
; CHECK-LABEL: and4080_ashr5:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a0, $a0, 11, 5
+; CHECK-NEXT: andi $a0, $a0, 4064
+; CHECK-NEXT: srai.d $a0, $a0, 5
; CHECK-NEXT: ret
%and = and i64 %a, 4080
%shr = ashr i64 %and, 5
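
These two bstrpick_d cases regress from one instruction to two, but the value is unchanged: 4080 = 0xFF0 has four trailing zero bits, so after the and no set bits are shifted out, the lshr is exact, and the result is still bits 11..4 of %a. Presumably it is this newly-inferred exact flag that perturbs the DAG enough for the bstrpick pattern to stop matching. The flagged form of the first test, as a sketch:

define i64 @and0xff0_lshr4_flagged(i64 %a) {
  %and = and i64 %a, 4080        ; low 4 bits now known zero
  %shr = lshr exact i64 %and, 4  ; == bstrpick.d bits 11..4
  ret i64 %shr
}
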
diff --git a/llvm/test/CodeGen/LoongArch/bstrpick_w.ll b/llvm/test/CodeGen/LoongArch/bstrpick_w.ll
index f9027e1fb32df3..974c34b17e7893 100644
--- a/llvm/test/CodeGen/LoongArch/bstrpick_w.ll
+++ b/llvm/test/CodeGen/LoongArch/bstrpick_w.ll
@@ -53,7 +53,8 @@ define i32 @and4095(i32 %a) {
define i32 @and0xff0_lshr4(i32 %a) {
; CHECK-LABEL: and0xff0_lshr4:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.w $a0, $a0, 11, 4
+; CHECK-NEXT: andi $a0, $a0, 4080
+; CHECK-NEXT: srli.w $a0, $a0, 4
; CHECK-NEXT: ret
%and = and i32 %a, 4080
%shr = lshr i32 %and, 4
@@ -66,7 +67,8 @@ define i32 @and0xff0_lshr4(i32 %a) {
define i32 @and4080_ashr5(i32 %a) {
; CHECK-LABEL: and4080_ashr5:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.w $a0, $a0, 11, 5
+; CHECK-NEXT: andi $a0, $a0, 4064
+; CHECK-NEXT: srai.w $a0, $a0, 5
; CHECK-NEXT: ret
%and = and i32 %a, 4080
%shr = ashr i32 %and, 5
diff --git a/llvm/test/CodeGen/LoongArch/bytepick.ll b/llvm/test/CodeGen/LoongArch/bytepick.ll
index 1a2cd48448ba28..191728c285fd35 100644
--- a/llvm/test/CodeGen/LoongArch/bytepick.ll
+++ b/llvm/test/CodeGen/LoongArch/bytepick.ll
@@ -14,7 +14,10 @@ define i32 @pick_i32_1(i32 %a, i32 %b) {
;
; LA64-LABEL: pick_i32_1:
; LA64: # %bb.0:
-; LA64-NEXT: bstrpick.d $a1, $a1, 31, 24
+; LA64-NEXT: lu12i.w $a2, -4096
+; LA64-NEXT: lu32i.d $a2, 0
+; LA64-NEXT: and $a1, $a1, $a2
+; LA64-NEXT: srli.d $a1, $a1, 24
; LA64-NEXT: slli.d $a0, $a0, 8
; LA64-NEXT: or $a0, $a1, $a0
; LA64-NEXT: ret
@@ -34,7 +37,13 @@ define signext i32 @pick_i32_1_sext(i32 %a, i32 %b) {
;
; LA64-LABEL: pick_i32_1_sext:
; LA64: # %bb.0:
-; LA64-NEXT: bytepick.w $a0, $a1, $a0, 1
+; LA64-NEXT: lu12i.w $a2, -4096
+; LA64-NEXT: lu32i.d $a2, 0
+; LA64-NEXT: and $a1, $a1, $a2
+; LA64-NEXT: srli.d $a1, $a1, 24
+; LA64-NEXT: slli.d $a0, $a0, 8
+; LA64-NEXT: or $a0, $a1, $a0
+; LA64-NEXT: addi.w $a0, $a0, 0
; LA64-NEXT: ret
%1 = lshr i32 %b, 24
%2 = shl i32 %a, 8
@@ -52,7 +61,10 @@ define i32 @pick_i32_2(i32 %a, i32 %b) {
;
; LA64-LABEL: pick_i32_2:
; LA64: # %bb.0:
-; LA64-NEXT: bstrpick.d $a1, $a1, 31, 16
+; LA64-NEXT: lu12i.w $a2, -16
+; LA64-NEXT: lu32i.d $a2, 0
+; LA64-NEXT: and $a1, $a1, $a2
+; LA64-NEXT: srli.d $a1, $a1, 16
; LA64-NEXT: slli.d $a0, $a0, 16
; LA64-NEXT: or $a0, $a1, $a0
; LA64-NEXT: ret
@@ -72,7 +84,13 @@ define signext i32 @pick_i32_2_sext(i32 %a, i32 %b) {
;
; LA64-LABEL: pick_i32_2_sext:
; LA64: # %bb.0:
-; LA64-NEXT: bytepick.w $a0, $a1, $a0, 2
+; LA64-NEXT: lu12i.w $a2, -16
+; LA64-NEXT: lu32i.d $a2, 0
+; LA64-NEXT: and $a1, $a1, $a2
+; LA64-NEXT: srli.d $a1, $a1, 16
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: or $a0, $a1, $a0
+; LA64-NEXT: addi.w $a0, $a0, 0
; LA64-NEXT: ret
%1 = lshr i32 %b, 16
%2 = shl i32 %a, 16
@@ -91,6 +109,8 @@ define i32 @pick_i32_3(i32 %a, i32 %b) {
; LA64-LABEL: pick_i32_3:
; LA64: # %bb.0:
; LA64-NEXT: bstrpick.d $a1, $a1, 31, 8
+; LA64-NEXT: slli.d $a1, $a1, 8
+; LA64-NEXT: srli.d $a1, $a1, 8
; LA64-NEXT: slli.d $a0, $a0, 24
; LA64-NEXT: or $a0, $a1, $a0
; LA64-NEXT: ret
@@ -110,7 +130,12 @@ define signext i32 @pick_i32_3_sext(i32 %a, i32 %b) {
;
; LA64-LABEL: pick_i32_3_sext:
; LA64: # %bb.0:
-; LA64-NEXT: bytepick.w $a0, $a1, $a0, 3
+; LA64-NEXT: bstrpick.d $a1, $a1, 31, 8
+; LA64-NEXT: slli.d $a1, $a1, 8
+; LA64-NEXT: srli.d $a1, $a1, 8
+; LA64-NEXT: slli.d $a0, $a0, 24
+; LA64-NEXT: or $a0, $a1, $a0
+; LA64-NEXT: addi.w $a0, $a0, 0
; LA64-NEXT: ret
%1 = lshr i32 %b, 8
%2 = shl i32 %a, 24
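
The bytepick regressions follow the same shape: instead of bstrpick.d / bytepick.w, the masked high byte of %b is materialized as an explicit constant and shifted. For pick_i32_1 the two sequences agree; a sketch with an illustrative name:

define i64 @top_byte(i64 %b) {
  ; old: bstrpick.d $a1, $a1, 31, 24
  ; new: and with 0xFF000000 (built via lu12i.w/lu32i.d), then srli.d 24
  %m = and i64 %b, 4278190080    ; 0xFF000000
  %r = lshr exact i64 %m, 24     ; bits 31..24 of %b, zero-extended
  ret i64 %r
}
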
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index 9fa3f5076bb221..e266bc160db875 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -214,9 +214,10 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; LA32-NEXT: add.w $a0, $a2, $a0
; LA32-NEXT: srli.w $a1, $a0, 4
; LA32-NEXT: add.w $a0, $a0, $a1
-; LA32-NEXT: bstrpick.w $a1, $a0, 11, 8
-; LA32-NEXT: andi $a0, $a0, 15
-; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: andi $a1, $a0, 15
+; LA32-NEXT: andi $a0, $a0, 3840
+; LA32-NEXT: srli.w $a0, $a0, 8
+; LA32-NEXT: add.w $a0, $a1, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_ctpop_i16:
@@ -234,9 +235,10 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; LA64-NEXT: add.d $a0, $a2, $a0
; LA64-NEXT: srli.d $a1, $a0, 4
; LA64-NEXT: add.d $a0, $a0, $a1
-; LA64-NEXT: bstrpick.d $a1, $a0, 11, 8
-; LA64-NEXT: andi $a0, $a0, 15
-; LA64-NEXT: add.d $a0, $a0, $a1
+; LA64-NEXT: andi $a1, $a0, 15
+; LA64-NEXT: andi $a0, $a0, 3840
+; LA64-NEXT: srli.d $a0, $a0, 8
+; LA64-NEXT: add.d $a0, $a1, $a0
; LA64-NEXT: ret
%1 = call i16 @llvm.ctpop.i16(i16 %a)
ret i16 %1
@@ -288,7 +290,10 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
; LA64-NEXT: lu12i.w $a1, 4112
; LA64-NEXT: ori $a1, $a1, 257
; LA64-NEXT: mul.d $a0, $a0, $a1
-; LA64-NEXT: bstrpick.d $a0, $a0, 31, 24
+; LA64-NEXT: lu12i.w $a1, -4096
+; LA64-NEXT: lu32i.d $a1, 0
+; LA64-NEXT: and $a0, $a0, $a1
+; LA64-NEXT: srli.d $a0, $a0, 24
; LA64-NEXT: ret
%1 = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %1
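
In test_ctpop_i16 only the final byte-sum addition changes: bstrpick of bits 11..8 versus and-with-0xF00-then-shift, which are the same quantity. A sketch of that tail step, assuming the usual SWAR layout where each byte holds its own popcount (names illustrative):

define i32 @ctpop_i16_tail(i32 %x) {
  %lo = and i32 %x, 15        ; count of byte 0 (at most 8, fits a nibble)
  %hm = and i32 %x, 3840      ; 0xF00
  %hi = lshr exact i32 %hm, 8 ; count of byte 1, i.e. bits 11..8
  %r  = add nuw nsw i32 %lo, %hi
  ret i32 %r
}
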
diff --git a/llvm/test/CodeGen/LoongArch/fcopysign.ll b/llvm/test/CodeGen/LoongArch/fcopysign.ll
index 49e8fbca3e12ed..c5871d4e00c1a4 100644
--- a/llvm/test/CodeGen/LoongArch/fcopysign.ll
+++ b/llvm/test/CodeGen/LoongArch/fcopysign.ll
@@ -34,6 +34,7 @@ define float @fcopysign_s(float %a, float %b) nounwind {
define double @fcopysign_d(double %a, double %b) nounwind {
; LA32F-LABEL: fcopysign_d:
; LA32F: # %bb.0:
+; LA32F-NEXT: bstrins.w $a3, $zero, 30, 0
; LA32F-NEXT: srli.w $a2, $a3, 31
; LA32F-NEXT: bstrins.w $a1, $a2, 31, 31
; LA32F-NEXT: ret
@@ -45,6 +46,7 @@ define double @fcopysign_d(double %a, double %b) nounwind {
;
; LA64F-LABEL: fcopysign_d:
; LA64F: # %bb.0:
+; LA64F-NEXT: bstrins.d $a1, $zero, 62, 0
; LA64F-NEXT: srli.d $a1, $a1, 63
; LA64F-NEXT: bstrins.d $a0, $a1, 63, 63
; LA64F-NEXT: ret
@@ -61,6 +63,7 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
; LA32F-LABEL: fold_promote_d_s:
; LA32F: # %bb.0:
; LA32F-NEXT: movfr2gr.s $a2, $fa0
+; LA32F-NEXT: bstrins.w $a2, $zero, 30, 0
; LA32F-NEXT: srli.w $a2, $a2, 31
; LA32F-NEXT: bstrins.w $a1, $a2, 31, 31
; LA32F-NEXT: ret
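
The fcopysign/fold_promote hunks gain a bstrins that clears bits 30..0 before the sign bit is shifted down. The result is unchanged, since a logical shift right by 31 discards those bits anyway; this is extra work, not a miscompile:

define i32 @signbit(i32 %x) {
  %m = and i32 %x, -2147483648   ; 0x80000000: what the bstrins.w clear does
  %r = lshr exact i32 %m, 31     ; same value as plain (lshr i32 %x, 31)
  ret i32 %r
}
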
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/and.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/and.ll
index b3e32cc5c00c64..f61fdd1d600543 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/and.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/and.ll
@@ -444,8 +444,11 @@ define i32 @and_add_lsr(i32 %x, i32 %y) {
;
; LA64-LABEL: and_add_lsr:
; LA64: # %bb.0:
+; LA64-NEXT: lu12i.w $a2, -256
+; LA64-NEXT: lu32i.d $a2, 0
+; LA64-NEXT: and $a1, $a1, $a2
; LA64-NEXT: addi.d $a0, $a0, -1
-; LA64-NEXT: bstrpick.d $a1, $a1, 31, 20
+; LA64-NEXT: srli.d $a1, $a1, 20
; LA64-NEXT: and $a0, $a1, $a0
; LA64-NEXT: ret
%1 = add i32 %x, 4095
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
index 495974a59ba67d..4823deda6708bb 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
@@ -4,8 +4,9 @@
define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_acquire_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -33,8 +34,9 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_acquire_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
@@ -102,8 +104,9 @@ define void @cmpxchg_i64_acquire_acquire(ptr %ptr, i64 %cmp, i64 %val) nounwind
define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_acquire_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -131,8 +134,9 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_acquire_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
@@ -200,8 +204,9 @@ define void @cmpxchg_i64_acquire_monotonic(ptr %ptr, i64 %cmp, i64 %val) nounwin
define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_acquire_acquire_reti8:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -231,8 +236,9 @@ define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind
define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_acquire_acquire_reti16:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
@@ -306,8 +312,9 @@ define i64 @cmpxchg_i64_acquire_acquire_reti64(ptr %ptr, i64 %cmp, i64 %val) nou
define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_acquire_acquire_reti1:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -339,8 +346,9 @@ define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind
define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_acquire_acquire_reti1:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
@@ -418,8 +426,9 @@ define i1 @cmpxchg_i64_acquire_acquire_reti1(ptr %ptr, i64 %cmp, i64 %val) nounw
define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_monotonic_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -447,8 +456,9 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_monotonic_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
@@ -516,8 +526,9 @@ define void @cmpxchg_i64_monotonic_monotonic(ptr %ptr, i64 %cmp, i64 %val) nounw
define i8 @cmpxchg_i8_monotonic_monotonic_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_monotonic_monotonic_reti8:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -547,8 +558,9 @@ define i8 @cmpxchg_i8_monotonic_monotonic_reti8(ptr %ptr, i8 %cmp, i8 %val) noun
define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_monotonic_monotonic_reti16:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
@@ -622,8 +634,9 @@ define i64 @cmpxchg_i64_monotonic_monotonic_reti64(ptr %ptr, i64 %cmp, i64 %val)
define i1 @cmpxchg_i8_monotonic_monotonic_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-LABEL: cmpxchg_i8_monotonic_monotonic_reti1:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: ori $a4, $zero, 255
; LA64-NEXT: sll.w $a4, $a4, $a3
; LA64-NEXT: andi $a1, $a1, 255
@@ -655,8 +668,9 @@ define i1 @cmpxchg_i8_monotonic_monotonic_reti1(ptr %ptr, i8 %cmp, i8 %val) noun
define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) nounwind {
; LA64-LABEL: cmpxchg_i16_monotonic_monotonic_reti1:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a3, $a0, 3
+; LA64-NEXT: andi $a3, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a3, $a3, 3
; LA64-NEXT: lu12i.w $a4, 15
; LA64-NEXT: ori $a4, $a4, 4095
; LA64-NEXT: sll.w $a4, $a4, $a3
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll
index 794242f45fdb8c..954f73ce186fef 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll
@@ -7,8 +7,9 @@
define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -35,8 +36,9 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -84,8 +86,9 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -112,8 +115,9 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -161,24 +165,24 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB8_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB8_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB8_3: # in Loop: Header=BB8_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -193,26 +197,26 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB9_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB9_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB9_3: # in Loop: Header=BB9_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -247,24 +251,24 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB12_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB12_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB12_3: # in Loop: Header=BB12_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -279,26 +283,26 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB13_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB13_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB13_3: # in Loop: Header=BB13_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -333,8 +337,9 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -361,8 +366,9 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -410,8 +416,9 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -438,8 +445,9 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -487,24 +495,24 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB24_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB24_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB24_3: # in Loop: Header=BB24_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -519,26 +527,26 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB25_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB25_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB25_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB25_3: # in Loop: Header=BB25_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -573,24 +581,24 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB28_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB28_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB28_3: # in Loop: Header=BB28_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -605,26 +613,26 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB29_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB29_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB29_3: # in Loop: Header=BB29_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -659,8 +667,9 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -687,8 +696,9 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -736,8 +746,9 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -764,8 +775,9 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -813,24 +825,24 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB40_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB40_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB40_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB40_3: # in Loop: Header=BB40_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -845,26 +857,26 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB41_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB41_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB41_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB41_3: # in Loop: Header=BB41_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -899,24 +911,24 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB44_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB44_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB44_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB44_3: # in Loop: Header=BB44_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -931,26 +943,26 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB45_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -985,8 +997,9 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1013,8 +1026,9 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1062,8 +1076,9 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1090,8 +1105,9 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1139,24 +1155,24 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB56_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB56_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB56_3: # in Loop: Header=BB56_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1171,26 +1187,26 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB57_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB57_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB57_3: # in Loop: Header=BB57_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1225,24 +1241,24 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB60_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB60_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB60_3: # in Loop: Header=BB60_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1257,26 +1273,26 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB61_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB61_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB61_3: # in Loop: Header=BB61_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1311,8 +1327,9 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1339,8 +1356,9 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umax_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1388,8 +1406,9 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1416,8 +1435,9 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_umin_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1465,24 +1485,24 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB72_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB72_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB72_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB72_3: # in Loop: Header=BB72_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1497,26 +1517,26 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_max_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB73_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a7, $a1, .LBB73_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB73_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB73_3: # in Loop: Header=BB73_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1551,24 +1571,24 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: ori $a4, $zero, 255
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: ori $a3, $zero, 255
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.b $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: xori $a3, $a3, 56
+; LA64-NEXT: xori $a4, $a2, 56
; LA64-NEXT: .LBB76_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB76_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB76_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB76_3: # in Loop: Header=BB76_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
@@ -1583,26 +1603,26 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA64-LABEL: atomicrmw_min_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT: andi $a3, $a2, 24
-; LA64-NEXT: lu12i.w $a4, 15
-; LA64-NEXT: ori $a4, $a4, 4095
-; LA64-NEXT: sll.w $a4, $a4, $a2
+; LA64-NEXT: slli.d $a2, $a2, 3
+; LA64-NEXT: lu12i.w $a3, 15
+; LA64-NEXT: ori $a3, $a3, 4095
+; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: ext.w.h $a1, $a1
; LA64-NEXT: sll.w $a1, $a1, $a2
-; LA64-NEXT: ori $a5, $zero, 48
-; LA64-NEXT: sub.d $a3, $a5, $a3
+; LA64-NEXT: ori $a4, $zero, 48
+; LA64-NEXT: sub.d $a4, $a4, $a2
; LA64-NEXT: .LBB77_1: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ll.w $a5, $a0, 0
-; LA64-NEXT: and $a7, $a5, $a4
+; LA64-NEXT: and $a7, $a5, $a3
; LA64-NEXT: move $a6, $a5
-; LA64-NEXT: sll.w $a7, $a7, $a3
-; LA64-NEXT: sra.w $a7, $a7, $a3
+; LA64-NEXT: sll.w $a7, $a7, $a4
+; LA64-NEXT: sra.w $a7, $a7, $a4
; LA64-NEXT: bge $a1, $a7, .LBB77_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB77_1 Depth=1
; LA64-NEXT: xor $a6, $a5, $a1
-; LA64-NEXT: and $a6, $a6, $a4
+; LA64-NEXT: and $a6, $a6, $a3
; LA64-NEXT: xor $a6, $a5, $a6
; LA64-NEXT: .LBB77_3: # in Loop: Header=BB77_1 Depth=1
; LA64-NEXT: sc.w $a6, $a0, 0
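
Every LoongArch hunk shown here boils down to one rewrite: the subword atomic expansions now compute the in-word bit offset as `(p & 3) << 3` (`andi` then `slli`) instead of shifting first and masking afterwards (`slli` then `andi ..., 24`). The pre-masked value lies in [0, 24], so a single register can serve both as the `sll.w` amount (which only consumes the low five bits) and as the offset fed to the later `sub.d`/`xori`; in the min/max loops that frees a scratch register, while the plain xchg/add/sub expansions pay one extra `andi`. As a quick sanity check of the identity involved, here is a small standalone C sketch (illustrative only, not part of the patch):

    /* Checks the bit identity behind the regenerated lines above:
     * (p << 3) & 24 == (p & 3) << 3, and a 5-bit shift amount
     * (as consumed by sll.w) agrees with the masked form too. */
    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        for (uint64_t p = 0; p < 256; ++p) {
            uint64_t old_amt = (p << 3) & 24; /* slli.d then andi ..., 24 */
            uint64_t new_amt = (p & 3) << 3;  /* andi ..., 3 then slli.d  */
            assert(old_amt == new_amt);
            assert(((p << 3) & 31) == new_amt); /* low 5 bits only */
        }
        return 0;
    }
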
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
index 9b83b4c9535ee9..3725c9e4e6d472 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
@@ -5,8 +5,9 @@
define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -25,8 +26,9 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -49,8 +51,9 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: nor $a2, $a2, $zero
@@ -65,8 +68,9 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: nor $a2, $a2, $zero
@@ -80,8 +84,9 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
@@ -95,8 +100,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: amor_db.w $a3, $a2, $a0
@@ -109,8 +115,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -130,8 +137,9 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -155,8 +163,9 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -172,8 +181,9 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -188,8 +198,9 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -204,8 +215,9 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -260,8 +272,9 @@ define i64 @atomicrmw_xchg_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -280,8 +293,9 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -304,8 +318,9 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -325,8 +340,9 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -391,8 +407,9 @@ define i64 @atomicrmw_add_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -411,8 +428,9 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -435,8 +453,9 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -456,8 +475,9 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -524,8 +544,9 @@ define i64 @atomicrmw_sub_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -545,8 +566,9 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -570,8 +592,9 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -592,8 +615,9 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -672,8 +696,9 @@ define i64 @atomicrmw_nand_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -690,8 +715,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -707,8 +733,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -726,8 +753,9 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -785,8 +813,9 @@ define i64 @atomicrmw_and_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1
@@ -800,8 +829,9 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -814,8 +844,9 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB25_1: # =>This Inner Loop Header: Depth=1
@@ -829,8 +860,9 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -884,8 +916,9 @@ define i64 @atomicrmw_or_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i8_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1
@@ -899,8 +932,9 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i8_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -913,8 +947,9 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i16_acquire:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1
@@ -928,8 +963,9 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i16_acquire:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -983,8 +1019,9 @@ define i64 @atomicrmw_xor_i64_acquire(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -1003,8 +1040,9 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1027,8 +1065,9 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: nor $a2, $a2, $zero
@@ -1043,8 +1082,9 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: nor $a2, $a2, $zero
@@ -1058,8 +1098,9 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1
@@ -1073,8 +1114,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: amor_db.w $a3, $a2, $a0
@@ -1087,8 +1129,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -1108,8 +1151,9 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1133,8 +1177,9 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -1150,8 +1195,9 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -1166,8 +1212,9 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -1182,8 +1229,9 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -1238,8 +1286,9 @@ define i64 @atomicrmw_xchg_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -1258,8 +1307,9 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1282,8 +1332,9 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -1303,8 +1354,9 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1369,8 +1421,9 @@ define i64 @atomicrmw_add_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -1389,8 +1442,9 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1413,8 +1467,9 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -1434,8 +1489,9 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1502,8 +1558,9 @@ define i64 @atomicrmw_sub_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -1523,8 +1580,9 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1548,8 +1606,9 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -1570,8 +1629,9 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1650,8 +1710,9 @@ define i64 @atomicrmw_nand_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -1668,8 +1729,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -1685,8 +1747,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -1704,8 +1767,9 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -1763,8 +1827,9 @@ define i64 @atomicrmw_and_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1
@@ -1778,8 +1843,9 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -1792,8 +1858,9 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1
@@ -1807,8 +1874,9 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -1862,8 +1930,9 @@ define i64 @atomicrmw_or_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i8_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1
@@ -1877,8 +1946,9 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i8_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -1891,8 +1961,9 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i16_release:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1
@@ -1906,8 +1977,9 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i16_release:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -1961,8 +2033,9 @@ define i64 @atomicrmw_xor_i64_release(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -1981,8 +2054,9 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -2005,8 +2079,9 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: nor $a2, $a2, $zero
@@ -2021,8 +2096,9 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: nor $a2, $a2, $zero
@@ -2036,8 +2112,9 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: .LBB66_1: # =>This Inner Loop Header: Depth=1
@@ -2051,8 +2128,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: amor_db.w $a3, $a2, $a0
@@ -2065,8 +2143,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -2086,8 +2165,9 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -2111,8 +2191,9 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -2128,8 +2209,9 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -2144,8 +2226,9 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -2160,8 +2243,9 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -2216,8 +2300,9 @@ define i64 @atomicrmw_xchg_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -2236,8 +2321,9 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -2260,8 +2346,9 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -2281,8 +2368,9 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -2347,8 +2435,9 @@ define i64 @atomicrmw_add_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -2367,8 +2456,9 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -2391,8 +2481,9 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -2412,8 +2503,9 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -2480,8 +2572,9 @@ define i64 @atomicrmw_sub_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -2501,8 +2594,9 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -2526,8 +2620,9 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -2548,8 +2643,9 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -2628,8 +2724,9 @@ define i64 @atomicrmw_nand_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -2646,8 +2743,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -2663,8 +2761,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -2682,8 +2781,9 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -2741,8 +2841,9 @@ define i64 @atomicrmw_and_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB88_1: # =>This Inner Loop Header: Depth=1
@@ -2756,8 +2857,9 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -2770,8 +2872,9 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB89_1: # =>This Inner Loop Header: Depth=1
@@ -2785,8 +2888,9 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -2840,8 +2944,9 @@ define i64 @atomicrmw_or_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i8_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB92_1: # =>This Inner Loop Header: Depth=1
@@ -2855,8 +2960,9 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i8_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -2869,8 +2975,9 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i16_acq_rel:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB93_1: # =>This Inner Loop Header: Depth=1
@@ -2884,8 +2991,9 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i16_acq_rel:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -2939,8 +3047,9 @@ define i64 @atomicrmw_xor_i64_acq_rel(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -2959,8 +3068,9 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -2983,8 +3093,9 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: nor $a2, $a2, $zero
@@ -2999,8 +3110,9 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: nor $a2, $a2, $zero
@@ -3014,8 +3126,9 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: .LBB98_1: # =>This Inner Loop Header: Depth=1
@@ -3029,8 +3142,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: amor_db.w $a3, $a2, $a0
@@ -3043,8 +3157,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -3064,8 +3179,9 @@ define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -3089,8 +3205,9 @@ define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -3106,8 +3223,9 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -3122,8 +3240,9 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -3138,8 +3257,9 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -3194,8 +3314,9 @@ define i64 @atomicrmw_xchg_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -3214,8 +3335,9 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -3238,8 +3360,9 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -3259,8 +3382,9 @@ define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -3325,8 +3449,9 @@ define i64 @atomicrmw_add_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -3345,8 +3470,9 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -3369,8 +3495,9 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -3390,8 +3517,9 @@ define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -3458,8 +3586,9 @@ define i64 @atomicrmw_sub_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -3479,8 +3608,9 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -3504,8 +3634,9 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -3526,8 +3657,9 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -3606,8 +3738,9 @@ define i64 @atomicrmw_nand_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -3624,8 +3757,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -3641,8 +3775,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -3660,8 +3795,9 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -3719,8 +3855,9 @@ define i64 @atomicrmw_and_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB120_1: # =>This Inner Loop Header: Depth=1
@@ -3734,8 +3871,9 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -3748,8 +3886,9 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB121_1: # =>This Inner Loop Header: Depth=1
@@ -3763,8 +3902,9 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -3818,8 +3958,9 @@ define i64 @atomicrmw_or_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i8_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB124_1: # =>This Inner Loop Header: Depth=1
@@ -3833,8 +3974,9 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i8_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -3847,8 +3989,9 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i16_seq_cst:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB125_1: # =>This Inner Loop Header: Depth=1
@@ -3862,8 +4005,9 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i16_seq_cst:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -3917,8 +4061,9 @@ define i64 @atomicrmw_xor_i64_seq_cst(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -3937,8 +4082,9 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -3961,8 +4107,9 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: nor $a2, $a2, $zero
@@ -3977,8 +4124,9 @@ define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: nor $a2, $a2, $zero
@@ -3992,8 +4140,9 @@ define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind {
define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: ori $a2, $zero, 255
; LA32-NEXT: sll.w $a2, $a2, $a1
; LA32-NEXT: .LBB130_1: # =>This Inner Loop Header: Depth=1
@@ -4007,8 +4156,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: ori $a2, $zero, 255
; LA64-NEXT: sll.w $a2, $a2, $a1
; LA64-NEXT: amor_db.w $a3, $a2, $a0
@@ -4021,8 +4171,9 @@ define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind {
define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xchg_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -4042,8 +4193,9 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -4067,8 +4219,9 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_0_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -4084,8 +4237,9 @@ define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_0_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -4100,8 +4254,9 @@ define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind {
define i16 @atomicrmw_xchg_minus_1_i16_monotonic(ptr %a) nounwind {
; LA32-LABEL: atomicrmw_xchg_minus_1_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a1, $a0, 3
+; LA32-NEXT: andi $a1, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a1, $a1, 3
; LA32-NEXT: lu12i.w $a2, 15
; LA32-NEXT: ori $a2, $a2, 4095
; LA32-NEXT: sll.w $a2, $a2, $a1
@@ -4116,8 +4271,9 @@ define i16 @atomicrmw_xchg_minus_1_i16_monotonic(ptr %a) nounwind {
;
; LA64-LABEL: atomicrmw_xchg_minus_1_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 3
+; LA64-NEXT: andi $a1, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a1, $a1, 3
; LA64-NEXT: lu12i.w $a2, 15
; LA64-NEXT: ori $a2, $a2, 4095
; LA64-NEXT: sll.w $a2, $a2, $a1
@@ -4172,8 +4328,9 @@ define i64 @atomicrmw_xchg_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -4192,8 +4349,9 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -4216,8 +4374,9 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_add_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -4237,8 +4396,9 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_add_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -4303,8 +4463,9 @@ define i64 @atomicrmw_add_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -4323,8 +4484,9 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -4347,8 +4509,9 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_sub_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -4368,8 +4531,9 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_sub_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -4436,8 +4600,9 @@ define i64 @atomicrmw_sub_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -4457,8 +4622,9 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -4482,8 +4648,9 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_nand_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -4504,8 +4671,9 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_nand_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -4584,8 +4752,9 @@ define i64 @atomicrmw_nand_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: ori $a3, $zero, 255
; LA32-NEXT: sll.w $a3, $a3, $a2
; LA32-NEXT: andi $a1, $a1, 255
@@ -4602,8 +4771,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: ori $a3, $zero, 255
; LA64-NEXT: sll.w $a3, $a3, $a2
; LA64-NEXT: andi $a1, $a1, 255
@@ -4619,8 +4789,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_and_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: lu12i.w $a3, 15
; LA32-NEXT: ori $a3, $a3, 4095
; LA32-NEXT: sll.w $a3, $a3, $a2
@@ -4638,8 +4809,9 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_and_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: lu12i.w $a3, 15
; LA64-NEXT: ori $a3, $a3, 4095
; LA64-NEXT: sll.w $a3, $a3, $a2
@@ -4697,8 +4869,9 @@ define i64 @atomicrmw_and_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB152_1: # =>This Inner Loop Header: Depth=1
@@ -4712,8 +4885,9 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -4726,8 +4900,9 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_or_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB153_1: # =>This Inner Loop Header: Depth=1
@@ -4741,8 +4916,9 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_or_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amor_db.w $a3, $a1, $a0
@@ -4796,8 +4972,9 @@ define i64 @atomicrmw_or_i64_monotonic(ptr %a, i64 %b) nounwind {
define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i8_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: andi $a1, $a1, 255
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB156_1: # =>This Inner Loop Header: Depth=1
@@ -4811,8 +4988,9 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i8_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: andi $a1, $a1, 255
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
@@ -4825,8 +5003,9 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
; LA32-LABEL: atomicrmw_xor_i16_monotonic:
; LA32: # %bb.0:
-; LA32-NEXT: slli.w $a2, $a0, 3
+; LA32-NEXT: andi $a2, $a0, 3
; LA32-NEXT: bstrins.w $a0, $zero, 1, 0
+; LA32-NEXT: slli.w $a2, $a2, 3
; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0
; LA32-NEXT: sll.w $a1, $a1, $a2
; LA32-NEXT: .LBB157_1: # =>This Inner Loop Header: Depth=1
@@ -4840,8 +5019,9 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
;
; LA64-LABEL: atomicrmw_xor_i16_monotonic:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a2, $a0, 3
+; LA64-NEXT: andi $a2, $a0, 3
; LA64-NEXT: bstrins.d $a0, $zero, 1, 0
+; LA64-NEXT: slli.d $a2, $a2, 3
; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0
; LA64-NEXT: sll.w $a1, $a1, $a2
; LA64-NEXT: amxor_db.w $a3, $a1, $a0
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/lshr.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/lshr.ll
index 7b28872780e824..9372644050fd6f 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/lshr.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/lshr.ll
@@ -104,12 +104,14 @@ define i1 @lshr_i1_3(i1 %x) {
define i8 @lshr_i8_3(i8 %x) {
; LA32-LABEL: lshr_i8_3:
; LA32: # %bb.0:
-; LA32-NEXT: bstrpick.w $a0, $a0, 7, 3
+; LA32-NEXT: andi $a0, $a0, 248
+; LA32-NEXT: srli.w $a0, $a0, 3
; LA32-NEXT: ret
;
; LA64-LABEL: lshr_i8_3:
; LA64: # %bb.0:
-; LA64-NEXT: bstrpick.d $a0, $a0, 7, 3
+; LA64-NEXT: andi $a0, $a0, 248
+; LA64-NEXT: srli.d $a0, $a0, 3
; LA64-NEXT: ret
%lshr = lshr i8 %x, 3
ret i8 %lshr
@@ -119,11 +121,15 @@ define i16 @lshr_i16_3(i16 %x) {
; LA32-LABEL: lshr_i16_3:
; LA32: # %bb.0:
; LA32-NEXT: bstrpick.w $a0, $a0, 15, 3
+; LA32-NEXT: slli.w $a0, $a0, 3
+; LA32-NEXT: srli.w $a0, $a0, 3
; LA32-NEXT: ret
;
; LA64-LABEL: lshr_i16_3:
; LA64: # %bb.0:
; LA64-NEXT: bstrpick.d $a0, $a0, 15, 3
+; LA64-NEXT: slli.d $a0, $a0, 3
+; LA64-NEXT: srli.d $a0, $a0, 3
; LA64-NEXT: ret
%lshr = lshr i16 %x, 3
ret i16 %lshr
@@ -138,6 +144,8 @@ define i32 @lshr_i32_3(i32 %x) {
; LA64-LABEL: lshr_i32_3:
; LA64: # %bb.0:
; LA64-NEXT: bstrpick.d $a0, $a0, 31, 3
+; LA64-NEXT: slli.d $a0, $a0, 3
+; LA64-NEXT: srli.d $a0, $a0, 3
; LA64-NEXT: ret
%lshr = lshr i32 %x, 3
ret i32 %lshr
diff --git a/llvm/test/CodeGen/LoongArch/legalicmpimm.ll b/llvm/test/CodeGen/LoongArch/legalicmpimm.ll
index 3dc8785631dc2c..6f550e5ac389c6 100644
--- a/llvm/test/CodeGen/LoongArch/legalicmpimm.ll
+++ b/llvm/test/CodeGen/LoongArch/legalicmpimm.ll
@@ -12,7 +12,10 @@ define i32 @icmpimm(i32 %x) {
;
; LA64-LABEL: icmpimm:
; LA64: # %bb.0:
-; LA64-NEXT: bstrpick.d $a0, $a0, 31, 12
+; LA64-NEXT: lu12i.w $a1, -1
+; LA64-NEXT: lu32i.d $a1, 0
+; LA64-NEXT: and $a0, $a0, $a1
+; LA64-NEXT: srli.d $a0, $a0, 12
; LA64-NEXT: addi.d $a0, $a0, -1
; LA64-NEXT: sltui $a0, $a0, 1
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
index 8646771e5d48a1..3b4ac67c65631d 100644
--- a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
+++ b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
@@ -591,9 +591,12 @@ define signext i32 @rotr_i32_fshr(i32 signext %a) nounwind {
;
; LA64-LABEL: rotr_i32_fshr:
; LA64: # %bb.0:
-; LA64-NEXT: slli.d $a1, $a0, 20
-; LA64-NEXT: bstrpick.d $a0, $a0, 31, 12
-; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: lu12i.w $a1, -1
+; LA64-NEXT: lu32i.d $a1, 0
+; LA64-NEXT: and $a1, $a0, $a1
+; LA64-NEXT: srli.d $a1, $a1, 12
+; LA64-NEXT: slli.d $a0, $a0, 20
+; LA64-NEXT: or $a0, $a1, $a0
; LA64-NEXT: addi.w $a0, $a0, 0
; LA64-NEXT: ret
%or = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 12)
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
index 7708873e264d9c..b80e627b618e21 100644
--- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -148,6 +148,7 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s3, $sp, 0 # 8-byte Folded Spill
; CHECK-NEXT: sra.w $a1, $a0, $a1
; CHECK-NEXT: lu12i.w $a0, 349525
; CHECK-NEXT: ori $fp, $a0, 1365
@@ -157,6 +158,8 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; CHECK-NEXT: ori $s1, $a0, 3855
; CHECK-NEXT: lu12i.w $a0, 4112
; CHECK-NEXT: ori $s2, $a0, 257
+; CHECK-NEXT: lu12i.w $s3, -4096
+; CHECK-NEXT: lu32i.d $s3, 0
; CHECK-NEXT: .p2align 4, , 16
; CHECK-NEXT: .LBB4_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -173,9 +176,11 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; CHECK-NEXT: add.d $a1, $a1, $a2
; CHECK-NEXT: and $a1, $a1, $s1
; CHECK-NEXT: mul.d $a1, $a1, $s2
-; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 24
+; CHECK-NEXT: and $a1, $a1, $s3
+; CHECK-NEXT: srli.d $a1, $a1, 24
; CHECK-NEXT: bnez $a0, .LBB4_1
; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s3, $sp, 0 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -192,6 +197,7 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; NORMV-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; NORMV-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; NORMV-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT: st.d $s3, $sp, 0 # 8-byte Folded Spill
; NORMV-NEXT: sra.w $a1, $a0, $a1
; NORMV-NEXT: lu12i.w $a0, 349525
; NORMV-NEXT: ori $fp, $a0, 1365
@@ -201,6 +207,8 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; NORMV-NEXT: ori $s1, $a0, 3855
; NORMV-NEXT: lu12i.w $a0, 4112
; NORMV-NEXT: ori $s2, $a0, 257
+; NORMV-NEXT: lu12i.w $s3, -4096
+; NORMV-NEXT: lu32i.d $s3, 0
; NORMV-NEXT: .p2align 4, , 16
; NORMV-NEXT: .LBB4_1: # %bb2
; NORMV-NEXT: # =>This Inner Loop Header: Depth=1
@@ -217,9 +225,11 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; NORMV-NEXT: add.d $a1, $a1, $a2
; NORMV-NEXT: and $a1, $a1, $s1
; NORMV-NEXT: mul.d $a1, $a1, $s2
-; NORMV-NEXT: bstrpick.d $a1, $a1, 31, 24
+; NORMV-NEXT: and $a1, $a1, $s3
+; NORMV-NEXT: srli.d $a1, $a1, 24
; NORMV-NEXT: bnez $a0, .LBB4_1
; NORMV-NEXT: # %bb.2: # %bb7
+; NORMV-NEXT: ld.d $s3, $sp, 0 # 8-byte Folded Reload
; NORMV-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
; NORMV-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; NORMV-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -1084,38 +1094,50 @@ define signext i32 @bug(i32 signext %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: beqz $a0, .LBB18_2
; CHECK-NEXT: # %bb.1: # %if.end
-; CHECK-NEXT: bstrpick.d $a1, $a0, 31, 16
+; CHECK-NEXT: lu12i.w $a1, -16
+; CHECK-NEXT: lu32i.d $a1, 0
+; CHECK-NEXT: and $a1, $a0, $a1
+; CHECK-NEXT: srli.d $a1, $a1, 16
; CHECK-NEXT: sltui $a1, $a1, 1
; CHECK-NEXT: slli.d $a2, $a0, 16
; CHECK-NEXT: masknez $a0, $a0, $a1
; CHECK-NEXT: maskeqz $a2, $a2, $a1
; CHECK-NEXT: or $a0, $a2, $a0
-; CHECK-NEXT: ori $a2, $zero, 32
-; CHECK-NEXT: masknez $a2, $a2, $a1
-; CHECK-NEXT: ori $a3, $zero, 16
-; CHECK-NEXT: maskeqz $a1, $a3, $a1
-; CHECK-NEXT: or $a1, $a1, $a2
-; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 24
+; CHECK-NEXT: lu12i.w $a2, -4096
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: and $a2, $a0, $a2
+; CHECK-NEXT: ori $a3, $zero, 32
+; CHECK-NEXT: masknez $a3, $a3, $a1
+; CHECK-NEXT: ori $a4, $zero, 16
+; CHECK-NEXT: maskeqz $a1, $a4, $a1
+; CHECK-NEXT: or $a1, $a1, $a3
+; CHECK-NEXT: srli.d $a2, $a2, 24
; CHECK-NEXT: sltui $a2, $a2, 1
; CHECK-NEXT: slli.d $a3, $a0, 8
; CHECK-NEXT: addi.d $a4, $a1, -8
; CHECK-NEXT: masknez $a0, $a0, $a2
; CHECK-NEXT: maskeqz $a3, $a3, $a2
; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: lu12i.w $a3, -65536
+; CHECK-NEXT: lu32i.d $a3, 0
+; CHECK-NEXT: and $a3, $a0, $a3
; CHECK-NEXT: masknez $a1, $a1, $a2
; CHECK-NEXT: maskeqz $a2, $a4, $a2
; CHECK-NEXT: or $a1, $a2, $a1
-; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 28
+; CHECK-NEXT: srli.d $a2, $a3, 28
; CHECK-NEXT: sltui $a2, $a2, 1
; CHECK-NEXT: slli.d $a3, $a0, 4
; CHECK-NEXT: addi.d $a4, $a1, -4
; CHECK-NEXT: masknez $a0, $a0, $a2
; CHECK-NEXT: maskeqz $a3, $a3, $a2
; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: lu12i.w $a3, -262144
+; CHECK-NEXT: lu32i.d $a3, 0
+; CHECK-NEXT: and $a3, $a0, $a3
; CHECK-NEXT: masknez $a1, $a1, $a2
; CHECK-NEXT: maskeqz $a2, $a4, $a2
; CHECK-NEXT: or $a1, $a2, $a1
-; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 30
+; CHECK-NEXT: srli.d $a2, $a3, 30
; CHECK-NEXT: sltui $a2, $a2, 1
; CHECK-NEXT: slli.d $a3, $a0, 2
; CHECK-NEXT: addi.d $a4, $a1, -2
@@ -1138,38 +1160,50 @@ define signext i32 @bug(i32 signext %x) {
; NORMV: # %bb.0: # %entry
; NORMV-NEXT: beqz $a0, .LBB18_2
; NORMV-NEXT: # %bb.1: # %if.end
-; NORMV-NEXT: bstrpick.d $a1, $a0, 31, 16
+; NORMV-NEXT: lu12i.w $a1, -16
+; NORMV-NEXT: lu32i.d $a1, 0
+; NORMV-NEXT: and $a1, $a0, $a1
+; NORMV-NEXT: srli.d $a1, $a1, 16
; NORMV-NEXT: sltui $a1, $a1, 1
; NORMV-NEXT: slli.d $a2, $a0, 16
; NORMV-NEXT: masknez $a0, $a0, $a1
; NORMV-NEXT: maskeqz $a2, $a2, $a1
; NORMV-NEXT: or $a0, $a2, $a0
-; NORMV-NEXT: ori $a2, $zero, 32
-; NORMV-NEXT: masknez $a2, $a2, $a1
-; NORMV-NEXT: ori $a3, $zero, 16
-; NORMV-NEXT: maskeqz $a1, $a3, $a1
-; NORMV-NEXT: or $a1, $a1, $a2
-; NORMV-NEXT: bstrpick.d $a2, $a0, 31, 24
+; NORMV-NEXT: lu12i.w $a2, -4096
+; NORMV-NEXT: lu32i.d $a2, 0
+; NORMV-NEXT: and $a2, $a0, $a2
+; NORMV-NEXT: ori $a3, $zero, 32
+; NORMV-NEXT: masknez $a3, $a3, $a1
+; NORMV-NEXT: ori $a4, $zero, 16
+; NORMV-NEXT: maskeqz $a1, $a4, $a1
+; NORMV-NEXT: or $a1, $a1, $a3
+; NORMV-NEXT: srli.d $a2, $a2, 24
; NORMV-NEXT: sltui $a2, $a2, 1
; NORMV-NEXT: slli.d $a3, $a0, 8
; NORMV-NEXT: addi.d $a4, $a1, -8
; NORMV-NEXT: masknez $a0, $a0, $a2
; NORMV-NEXT: maskeqz $a3, $a3, $a2
; NORMV-NEXT: or $a0, $a3, $a0
+; NORMV-NEXT: lu12i.w $a3, -65536
+; NORMV-NEXT: lu32i.d $a3, 0
+; NORMV-NEXT: and $a3, $a0, $a3
; NORMV-NEXT: masknez $a1, $a1, $a2
; NORMV-NEXT: maskeqz $a2, $a4, $a2
; NORMV-NEXT: or $a1, $a2, $a1
-; NORMV-NEXT: bstrpick.d $a2, $a0, 31, 28
+; NORMV-NEXT: srli.d $a2, $a3, 28
; NORMV-NEXT: sltui $a2, $a2, 1
; NORMV-NEXT: slli.d $a3, $a0, 4
; NORMV-NEXT: addi.d $a4, $a1, -4
; NORMV-NEXT: masknez $a0, $a0, $a2
; NORMV-NEXT: maskeqz $a3, $a3, $a2
; NORMV-NEXT: or $a0, $a3, $a0
+; NORMV-NEXT: lu12i.w $a3, -262144
+; NORMV-NEXT: lu32i.d $a3, 0
+; NORMV-NEXT: and $a3, $a0, $a3
; NORMV-NEXT: masknez $a1, $a1, $a2
; NORMV-NEXT: maskeqz $a2, $a4, $a2
; NORMV-NEXT: or $a1, $a2, $a1
-; NORMV-NEXT: bstrpick.d $a2, $a0, 31, 30
+; NORMV-NEXT: srli.d $a2, $a3, 30
; NORMV-NEXT: sltui $a2, $a2, 1
; NORMV-NEXT: slli.d $a3, $a0, 2
; NORMV-NEXT: addi.d $a4, $a1, -2
diff --git a/llvm/test/CodeGen/Mips/atomic.ll b/llvm/test/CodeGen/Mips/atomic.ll
index eaf99cc7023a31..13ef414ce0f46d 100644
--- a/llvm/test/CodeGen/Mips/atomic.ll
+++ b/llvm/test/CodeGen/Mips/atomic.ll
@@ -4869,7 +4869,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; MIPS32-NEXT: sll $1, $1, 24
; MIPS32-NEXT: sra $1, $1, 24
; MIPS32-NEXT: # %bb.4: # %entry
-; MIPS32-NEXT: xor $1, $1, $5
+; MIPS32-NEXT: sll $2, $5, 24
+; MIPS32-NEXT: sra $2, $2, 24
+; MIPS32-NEXT: xor $1, $1, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: sltiu $2, $1, 1
;
@@ -5062,7 +5064,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; MIPS4-NEXT: sll $1, $1, 24
; MIPS4-NEXT: sra $1, $1, 24
; MIPS4-NEXT: # %bb.4: # %entry
-; MIPS4-NEXT: xor $1, $1, $5
+; MIPS4-NEXT: sll $2, $5, 24
+; MIPS4-NEXT: sra $2, $2, 24
+; MIPS4-NEXT: xor $1, $1, $2
; MIPS4-NEXT: jr $ra
; MIPS4-NEXT: sltiu $2, $1, 1
;
@@ -5097,7 +5101,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; MIPS64-NEXT: sll $1, $1, 24
; MIPS64-NEXT: sra $1, $1, 24
; MIPS64-NEXT: # %bb.4: # %entry
-; MIPS64-NEXT: xor $1, $1, $5
+; MIPS64-NEXT: sll $2, $5, 24
+; MIPS64-NEXT: sra $2, $2, 24
+; MIPS64-NEXT: xor $1, $1, $2
; MIPS64-NEXT: jr $ra
; MIPS64-NEXT: sltiu $2, $1, 1
;
@@ -5274,7 +5280,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; O1-NEXT: sll $1, $1, 24
; O1-NEXT: sra $1, $1, 24
; O1-NEXT: # %bb.4: # %entry
-; O1-NEXT: xor $1, $1, $5
+; O1-NEXT: sll $2, $5, 24
+; O1-NEXT: sra $2, $2, 24
+; O1-NEXT: xor $1, $1, $2
; O1-NEXT: jr $ra
; O1-NEXT: sltiu $2, $1, 1
;
@@ -5309,7 +5317,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; O2-NEXT: sll $1, $1, 24
; O2-NEXT: sra $1, $1, 24
; O2-NEXT: # %bb.4: # %entry
-; O2-NEXT: xor $1, $1, $5
+; O2-NEXT: sll $2, $5, 24
+; O2-NEXT: sra $2, $2, 24
+; O2-NEXT: xor $1, $1, $2
; O2-NEXT: jr $ra
; O2-NEXT: sltiu $2, $1, 1
;
@@ -5344,7 +5354,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; O3-NEXT: sll $1, $1, 24
; O3-NEXT: sra $1, $1, 24
; O3-NEXT: # %bb.4: # %entry
-; O3-NEXT: xor $1, $1, $5
+; O3-NEXT: sll $2, $5, 24
+; O3-NEXT: sra $2, $2, 24
+; O3-NEXT: xor $1, $1, $2
; O3-NEXT: jr $ra
; O3-NEXT: sltiu $2, $1, 1
;
@@ -5380,7 +5392,9 @@ define i1 @AtomicCmpSwapRes8(ptr %ptr, i8 signext %oldval, i8 signext %newval) n
; MIPS32EB-NEXT: sll $1, $1, 24
; MIPS32EB-NEXT: sra $1, $1, 24
; MIPS32EB-NEXT: # %bb.4: # %entry
-; MIPS32EB-NEXT: xor $1, $1, $5
+; MIPS32EB-NEXT: sll $2, $5, 24
+; MIPS32EB-NEXT: sra $2, $2, 24
+; MIPS32EB-NEXT: xor $1, $1, $2
; MIPS32EB-NEXT: jr $ra
; MIPS32EB-NEXT: sltiu $2, $1, 1
entry:
diff --git a/llvm/test/CodeGen/Mips/cconv/illegal-vectors.ll b/llvm/test/CodeGen/Mips/cconv/illegal-vectors.ll
index 5cb5972f677536..16bd4205120bba 100644
--- a/llvm/test/CodeGen/Mips/cconv/illegal-vectors.ll
+++ b/llvm/test/CodeGen/Mips/cconv/illegal-vectors.ll
@@ -680,10 +680,10 @@ define <3 x i24> @ret_v3i24(ptr %p) {
; MIPS64-NEXT: or $2, $2, $5
; MIPS64-NEXT: or $1, $1, $3
; MIPS64-NEXT: lbu $3, 4($4)
-; MIPS64-NEXT: sll $3, $3, 8
; MIPS64-NEXT: lb $5, 3($4)
-; MIPS64-NEXT: sll $5, $5, 16
+; MIPS64-NEXT: sll $5, $5, 8
; MIPS64-NEXT: or $3, $5, $3
+; MIPS64-NEXT: sll $3, $3, 8
; MIPS64-NEXT: lbu $4, 5($4)
; MIPS64-NEXT: or $3, $4, $3
; MIPS64-NEXT: jr $ra
@@ -700,10 +700,10 @@ define <3 x i24> @ret_v3i24(ptr %p) {
; MIPS32-NEXT: or $2, $2, $5
; MIPS32-NEXT: or $1, $1, $3
; MIPS32-NEXT: lbu $3, 4($4)
-; MIPS32-NEXT: sll $3, $3, 8
; MIPS32-NEXT: lb $5, 3($4)
-; MIPS32-NEXT: sll $5, $5, 16
+; MIPS32-NEXT: sll $5, $5, 8
; MIPS32-NEXT: or $3, $5, $3
+; MIPS32-NEXT: sll $3, $3, 8
; MIPS32-NEXT: lbu $4, 5($4)
; MIPS32-NEXT: or $3, $4, $3
; MIPS32-NEXT: jr $ra
@@ -719,19 +719,19 @@ define void @call_v3i24(ptr %p) nounwind {
; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill
; MIPS64-NEXT: move $16, $4
-; MIPS64-NEXT: lbu $1, 4($4)
-; MIPS64-NEXT: lbu $2, 8($4)
-; MIPS64-NEXT: lh $3, 6($4)
-; MIPS64-NEXT: dsll $3, $3, 8
-; MIPS64-NEXT: lbu $4, 2($4)
-; MIPS64-NEXT: lh $5, 0($16)
-; MIPS64-NEXT: dsll $5, $5, 8
-; MIPS64-NEXT: or $4, $4, $5
-; MIPS64-NEXT: or $6, $2, $3
-; MIPS64-NEXT: dsll $1, $1, 8
+; MIPS64-NEXT: lbu $1, 8($4)
+; MIPS64-NEXT: lh $2, 6($4)
+; MIPS64-NEXT: dsll $2, $2, 8
+; MIPS64-NEXT: lbu $3, 2($4)
+; MIPS64-NEXT: lh $4, 0($4)
+; MIPS64-NEXT: dsll $4, $4, 8
+; MIPS64-NEXT: or $4, $3, $4
+; MIPS64-NEXT: or $6, $1, $2
+; MIPS64-NEXT: lbu $1, 4($16)
; MIPS64-NEXT: lb $2, 3($16)
-; MIPS64-NEXT: dsll $2, $2, 16
+; MIPS64-NEXT: dsll $2, $2, 8
; MIPS64-NEXT: or $1, $2, $1
+; MIPS64-NEXT: dsll $1, $1, 8
; MIPS64-NEXT: lbu $2, 5($16)
; MIPS64-NEXT: jal arg_v3i24
; MIPS64-NEXT: or $5, $2, $1
@@ -759,19 +759,19 @@ define void @call_v3i24(ptr %p) nounwind {
; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
; MIPS32-NEXT: move $16, $4
-; MIPS32-NEXT: lbu $1, 4($4)
-; MIPS32-NEXT: lbu $2, 8($4)
-; MIPS32-NEXT: lh $3, 6($4)
-; MIPS32-NEXT: sll $3, $3, 8
-; MIPS32-NEXT: lbu $4, 2($4)
-; MIPS32-NEXT: lhu $5, 0($16)
-; MIPS32-NEXT: sll $5, $5, 8
-; MIPS32-NEXT: or $4, $4, $5
-; MIPS32-NEXT: or $6, $2, $3
-; MIPS32-NEXT: sll $1, $1, 8
+; MIPS32-NEXT: lbu $1, 8($4)
+; MIPS32-NEXT: lh $2, 6($4)
+; MIPS32-NEXT: sll $2, $2, 8
+; MIPS32-NEXT: lbu $3, 2($4)
+; MIPS32-NEXT: lhu $4, 0($4)
+; MIPS32-NEXT: sll $4, $4, 8
+; MIPS32-NEXT: or $4, $3, $4
+; MIPS32-NEXT: or $6, $1, $2
+; MIPS32-NEXT: lbu $1, 4($16)
; MIPS32-NEXT: lb $2, 3($16)
-; MIPS32-NEXT: sll $2, $2, 16
+; MIPS32-NEXT: sll $2, $2, 8
; MIPS32-NEXT: or $1, $2, $1
+; MIPS32-NEXT: sll $1, $1, 8
; MIPS32-NEXT: lbu $2, 5($16)
; MIPS32-NEXT: jal arg_v3i24
; MIPS32-NEXT: or $5, $2, $1
@@ -1044,34 +1044,33 @@ define void @arg_v4i18(<4 x i18> %vec, ptr %p) {
;
; MIPS32-LABEL: arg_v4i18:
; MIPS32: # %bb.0:
-; MIPS32-NEXT: sll $1, $4, 14
-; MIPS32-NEXT: lui $2, 63
-; MIPS32-NEXT: lui $3, 65280
-; MIPS32-NEXT: and $1, $1, $3
-; MIPS32-NEXT: ori $2, $2, 65280
-; MIPS32-NEXT: sll $3, $5, 4
-; MIPS32-NEXT: and $2, $3, $2
-; MIPS32-NEXT: sll $4, $4, 22
-; MIPS32-NEXT: or $2, $4, $2
-; MIPS32-NEXT: srl $2, $2, 8
-; MIPS32-NEXT: lui $4, 3
-; MIPS32-NEXT: or $1, $1, $2
-; MIPS32-NEXT: ori $2, $4, 65280
-; MIPS32-NEXT: and $2, $7, $2
-; MIPS32-NEXT: sll $5, $6, 18
-; MIPS32-NEXT: or $2, $5, $2
-; MIPS32-NEXT: lw $5, 16($sp)
-; MIPS32-NEXT: sb $7, 8($5)
-; MIPS32-NEXT: sw $1, 0($5)
-; MIPS32-NEXT: srl $1, $2, 8
-; MIPS32-NEXT: ori $2, $4, 49152
+; MIPS32-NEXT: lui $1, 3
+; MIPS32-NEXT: ori $2, $1, 49152
; MIPS32-NEXT: and $2, $6, $2
; MIPS32-NEXT: srl $2, $2, 14
+; MIPS32-NEXT: ori $3, $1, 65535
+; MIPS32-NEXT: and $3, $5, $3
+; MIPS32-NEXT: sll $3, $3, 4
; MIPS32-NEXT: or $2, $3, $2
; MIPS32-NEXT: sll $2, $2, 24
+; MIPS32-NEXT: ori $1, $1, 65280
+; MIPS32-NEXT: and $1, $7, $1
+; MIPS32-NEXT: sll $5, $6, 18
+; MIPS32-NEXT: or $1, $5, $1
+; MIPS32-NEXT: srl $1, $1, 8
; MIPS32-NEXT: or $1, $1, $2
+; MIPS32-NEXT: lw $2, 16($sp)
+; MIPS32-NEXT: sb $7, 8($2)
+; MIPS32-NEXT: sw $1, 4($2)
+; MIPS32-NEXT: sll $1, $4, 14
+; MIPS32-NEXT: lui $5, 65280
+; MIPS32-NEXT: and $1, $1, $5
+; MIPS32-NEXT: sll $4, $4, 22
+; MIPS32-NEXT: or $3, $4, $3
+; MIPS32-NEXT: srl $3, $3, 8
+; MIPS32-NEXT: or $1, $1, $3
; MIPS32-NEXT: jr $ra
-; MIPS32-NEXT: sw $1, 4($5)
+; MIPS32-NEXT: sw $1, 0($2)
store <4 x i18> %vec, ptr %p
ret void
}
@@ -1090,13 +1089,11 @@ define <4 x i18> @ret_v4i18(ptr %p) {
; MIPS64-NEXT: ori $5, $3, 65535
; MIPS64-NEXT: dsrl $3, $2, 28
; MIPS64-NEXT: sll $3, $3, 0
-; MIPS64-NEXT: lui $7, 3
; MIPS64-NEXT: and $3, $3, $5
; MIPS64-NEXT: and $4, $4, $5
; MIPS64-NEXT: and $5, $1, $5
-; MIPS64-NEXT: ori $1, $7, 64512
-; MIPS64-NEXT: dsrl $2, $2, 46
-; MIPS64-NEXT: and $1, $2, $1
+; MIPS64-NEXT: dsrl $1, $2, 56
+; MIPS64-NEXT: dsll $1, $1, 10
; MIPS64-NEXT: dsrl $2, $6, 54
; MIPS64-NEXT: or $1, $2, $1
; MIPS64-NEXT: jr $ra
@@ -1109,17 +1106,16 @@ define <4 x i18> @ret_v4i18(ptr %p) {
; MIPS32-NEXT: sll $6, $2, 8
; MIPS32-NEXT: lui $3, 3
; MIPS32-NEXT: or $1, $1, $6
-; MIPS32-NEXT: ori $5, $3, 64512
-; MIPS32-NEXT: lw $4, 0($4)
-; MIPS32-NEXT: srl $7, $4, 14
-; MIPS32-NEXT: and $5, $7, $5
; MIPS32-NEXT: srl $7, $2, 24
; MIPS32-NEXT: ori $8, $3, 65535
-; MIPS32-NEXT: sll $3, $4, 8
-; MIPS32-NEXT: srl $2, $3, 22
-; MIPS32-NEXT: or $2, $2, $5
+; MIPS32-NEXT: lw $2, 0($4)
+; MIPS32-NEXT: srl $3, $2, 24
+; MIPS32-NEXT: sll $3, $3, 10
+; MIPS32-NEXT: sll $4, $2, 8
+; MIPS32-NEXT: srl $2, $4, 22
+; MIPS32-NEXT: or $2, $2, $3
; MIPS32-NEXT: and $5, $1, $8
-; MIPS32-NEXT: or $1, $3, $7
+; MIPS32-NEXT: or $1, $4, $7
; MIPS32-NEXT: srl $1, $1, 4
; MIPS32-NEXT: and $3, $1, $8
; MIPS32-NEXT: sll $1, $7, 14
@@ -1141,21 +1137,20 @@ define void @call_v4i18(ptr %p) nounwind {
; MIPS64-NEXT: sd $17, 16($sp) # 8-byte Folded Spill
; MIPS64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
; MIPS64-NEXT: move $16, $4
+; MIPS64-NEXT: ld $1, 0($4)
+; MIPS64-NEXT: dsrl $2, $1, 56
+; MIPS64-NEXT: dsrl $3, $1, 10
; MIPS64-NEXT: lui $17, 3
-; MIPS64-NEXT: ori $1, $17, 64512
-; MIPS64-NEXT: ld $2, 0($4)
-; MIPS64-NEXT: dsrl $3, $2, 46
-; MIPS64-NEXT: dsrl $4, $2, 10
; MIPS64-NEXT: ori $18, $17, 65535
-; MIPS64-NEXT: dsrl $5, $2, 28
-; MIPS64-NEXT: and $5, $5, $18
-; MIPS64-NEXT: and $6, $4, $18
-; MIPS64-NEXT: and $1, $3, $1
-; MIPS64-NEXT: dsll $2, $2, 8
-; MIPS64-NEXT: dsrl $3, $2, 54
-; MIPS64-NEXT: or $4, $3, $1
-; MIPS64-NEXT: lbu $1, 8($16)
-; MIPS64-NEXT: or $1, $1, $2
+; MIPS64-NEXT: dsrl $4, $1, 28
+; MIPS64-NEXT: and $5, $4, $18
+; MIPS64-NEXT: and $6, $3, $18
+; MIPS64-NEXT: dsll $2, $2, 10
+; MIPS64-NEXT: dsll $1, $1, 8
+; MIPS64-NEXT: dsrl $3, $1, 54
+; MIPS64-NEXT: or $4, $3, $2
+; MIPS64-NEXT: lbu $2, 8($16)
+; MIPS64-NEXT: or $1, $2, $1
; MIPS64-NEXT: jal arg_v4i18
; MIPS64-NEXT: and $7, $1, $18
; MIPS64-NEXT: daddiu $1, $zero, 255
@@ -1208,40 +1203,37 @@ define void @call_v4i18(ptr %p) nounwind {
; MIPS32-NEXT: srl $5, $5, 4
; MIPS32-NEXT: or $6, $6, $1
; MIPS32-NEXT: lui $17, 3
-; MIPS32-NEXT: ori $7, $17, 64512
-; MIPS32-NEXT: srl $3, $3, 14
-; MIPS32-NEXT: and $3, $3, $7
-; MIPS32-NEXT: ori $8, $17, 65535
+; MIPS32-NEXT: ori $18, $17, 65535
+; MIPS32-NEXT: srl $3, $3, 24
+; MIPS32-NEXT: sll $3, $3, 10
; MIPS32-NEXT: srl $4, $4, 22
; MIPS32-NEXT: or $4, $4, $3
-; MIPS32-NEXT: and $7, $6, $8
-; MIPS32-NEXT: and $5, $5, $8
+; MIPS32-NEXT: and $7, $6, $18
+; MIPS32-NEXT: and $5, $5, $18
; MIPS32-NEXT: sll $2, $2, 14
; MIPS32-NEXT: srl $1, $1, 18
; MIPS32-NEXT: or $1, $1, $2
; MIPS32-NEXT: jal arg_v4i18
-; MIPS32-NEXT: and $6, $1, $8
-; MIPS32-NEXT: ori $18, $17, 49152
-; MIPS32-NEXT: ori $17, $17, 65280
-; MIPS32-NEXT: lui $1, 63
+; MIPS32-NEXT: and $6, $1, $18
+; MIPS32-NEXT: ori $19, $17, 65280
; MIPS32-NEXT: jal ret_v4i18
-; MIPS32-NEXT: ori $19, $1, 65280
+; MIPS32-NEXT: ori $17, $17, 49152
; MIPS32-NEXT: lui $1, 65280
-; MIPS32-NEXT: and $6, $5, $17
-; MIPS32-NEXT: sll $7, $4, 18
-; MIPS32-NEXT: or $6, $7, $6
-; MIPS32-NEXT: srl $6, $6, 8
-; MIPS32-NEXT: and $4, $4, $18
-; MIPS32-NEXT: srl $4, $4, 14
+; MIPS32-NEXT: and $6, $4, $17
+; MIPS32-NEXT: srl $6, $6, 14
+; MIPS32-NEXT: and $3, $3, $18
; MIPS32-NEXT: sll $3, $3, 4
-; MIPS32-NEXT: or $4, $3, $4
-; MIPS32-NEXT: sll $4, $4, 24
-; MIPS32-NEXT: or $4, $6, $4
-; MIPS32-NEXT: sll $6, $2, 14
+; MIPS32-NEXT: or $6, $3, $6
+; MIPS32-NEXT: sll $6, $6, 24
+; MIPS32-NEXT: and $7, $5, $19
+; MIPS32-NEXT: sll $4, $4, 18
+; MIPS32-NEXT: or $4, $4, $7
+; MIPS32-NEXT: srl $4, $4, 8
+; MIPS32-NEXT: or $4, $4, $6
; MIPS32-NEXT: sb $5, 8($16)
; MIPS32-NEXT: sw $4, 4($16)
-; MIPS32-NEXT: and $1, $6, $1
-; MIPS32-NEXT: and $3, $3, $19
+; MIPS32-NEXT: sll $4, $2, 14
+; MIPS32-NEXT: and $1, $4, $1
; MIPS32-NEXT: sll $2, $2, 22
; MIPS32-NEXT: or $2, $2, $3
; MIPS32-NEXT: srl $2, $2, 8
@@ -1341,21 +1333,85 @@ define void @arg_v7i18(<7 x i18> %vec, ptr %p) {
define <7 x i18> @ret_v7i18(ptr %p) {
; MIPS64-LABEL: ret_v7i18:
; MIPS64: # %bb.0:
-; MIPS64-NEXT: ld $1, 0($5)
+; MIPS64-NEXT: lui $1, 3
+; MIPS64-NEXT: ori $2, $1, 65535
+; MIPS64-NEXT: ld $3, 0($5)
+; MIPS64-NEXT: dsrl $6, $3, 26
+; MIPS64-NEXT: and $6, $6, $2
+; MIPS64-NEXT: dsll $6, $6, 26
+; MIPS64-NEXT: dsrl $7, $3, 44
+; MIPS64-NEXT: dsll $7, $7, 44
+; MIPS64-NEXT: or $6, $7, $6
+; MIPS64-NEXT: dsrl $7, $3, 8
+; MIPS64-NEXT: and $7, $7, $2
+; MIPS64-NEXT: dsll $7, $7, 8
+; MIPS64-NEXT: ori $1, $1, 64512
+; MIPS64-NEXT: or $6, $6, $7
+; MIPS64-NEXT: dsll $3, $3, 10
+; MIPS64-NEXT: and $1, $3, $1
+; MIPS64-NEXT: dsrl $1, $1, 10
+; MIPS64-NEXT: or $1, $6, $1
+; MIPS64-NEXT: ld $3, 8($5)
+; MIPS64-NEXT: dsrl $5, $3, 36
+; MIPS64-NEXT: and $5, $5, $2
+; MIPS64-NEXT: dsll $5, $5, 36
+; MIPS64-NEXT: dsrl $6, $3, 54
+; MIPS64-NEXT: dsll $6, $6, 54
; MIPS64-NEXT: sd $1, 0($4)
-; MIPS64-NEXT: ld $1, 8($5)
+; MIPS64-NEXT: or $1, $6, $5
+; MIPS64-NEXT: dsrl $5, $3, 18
+; MIPS64-NEXT: and $5, $5, $2
+; MIPS64-NEXT: dsll $5, $5, 18
+; MIPS64-NEXT: or $1, $1, $5
+; MIPS64-NEXT: and $2, $3, $2
+; MIPS64-NEXT: or $1, $1, $2
; MIPS64-NEXT: jr $ra
; MIPS64-NEXT: sd $1, 8($4)
;
; MIPS32-LABEL: ret_v7i18:
; MIPS32: # %bb.0:
-; MIPS32-NEXT: lw $1, 0($5)
-; MIPS32-NEXT: sw $1, 0($4)
-; MIPS32-NEXT: lw $1, 4($5)
-; MIPS32-NEXT: sw $1, 4($4)
-; MIPS32-NEXT: lw $1, 12($5)
-; MIPS32-NEXT: sw $1, 12($4)
-; MIPS32-NEXT: lw $1, 8($5)
+; MIPS32-NEXT: lui $1, 3
+; MIPS32-NEXT: ori $2, $1, 65535
+; MIPS32-NEXT: lw $3, 4($5)
+; MIPS32-NEXT: srl $6, $3, 8
+; MIPS32-NEXT: and $6, $6, $2
+; MIPS32-NEXT: ori $7, $1, 65472
+; MIPS32-NEXT: lw $8, 0($5)
+; MIPS32-NEXT: sll $9, $8, 6
+; MIPS32-NEXT: sll $6, $6, 8
+; MIPS32-NEXT: and $7, $9, $7
+; MIPS32-NEXT: srl $9, $3, 26
+; MIPS32-NEXT: sll $9, $9, 26
+; MIPS32-NEXT: lw $10, 8($5)
+; MIPS32-NEXT: srl $11, $10, 4
+; MIPS32-NEXT: srl $7, $7, 6
+; MIPS32-NEXT: and $11, $11, $2
+; MIPS32-NEXT: or $6, $9, $6
+; MIPS32-NEXT: ori $9, $1, 64512
+; MIPS32-NEXT: sll $3, $3, 10
+; MIPS32-NEXT: and $3, $3, $9
+; MIPS32-NEXT: srl $3, $3, 10
+; MIPS32-NEXT: lw $5, 12($5)
+; MIPS32-NEXT: srl $8, $8, 12
+; MIPS32-NEXT: sll $8, $8, 12
+; MIPS32-NEXT: or $3, $6, $3
+; MIPS32-NEXT: sll $6, $11, 4
+; MIPS32-NEXT: or $7, $8, $7
+; MIPS32-NEXT: and $2, $5, $2
+; MIPS32-NEXT: srl $5, $5, 18
+; MIPS32-NEXT: sll $5, $5, 18
+; MIPS32-NEXT: or $2, $5, $2
+; MIPS32-NEXT: srl $5, $10, 22
+; MIPS32-NEXT: sll $5, $5, 22
+; MIPS32-NEXT: sw $2, 12($4)
+; MIPS32-NEXT: sw $7, 0($4)
+; MIPS32-NEXT: sw $3, 4($4)
+; MIPS32-NEXT: or $2, $5, $6
+; MIPS32-NEXT: ori $1, $1, 49152
+; MIPS32-NEXT: sll $3, $10, 14
+; MIPS32-NEXT: and $1, $3, $1
+; MIPS32-NEXT: srl $1, $1, 14
+; MIPS32-NEXT: or $1, $2, $1
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: sw $1, 8($4)
%v = load <7 x i18>, ptr %p
@@ -1365,8 +1421,10 @@ define <7 x i18> @ret_v7i18(ptr %p) {
define void @call_v7i18(ptr %p) nounwind {
; MIPS64-LABEL: call_v7i18:
; MIPS64: # %bb.0:
-; MIPS64-NEXT: daddiu $sp, $sp, -32
-; MIPS64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT: daddiu $sp, $sp, -48
+; MIPS64-NEXT: sd $ra, 40($sp) # 8-byte Folded Spill
+; MIPS64-NEXT: sd $18, 32($sp) # 8-byte Folded Spill
+; MIPS64-NEXT: sd $17, 24($sp) # 8-byte Folded Spill
; MIPS64-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
; MIPS64-NEXT: move $16, $4
; MIPS64-NEXT: ld $1, 0($4)
@@ -1375,36 +1433,67 @@ define void @call_v7i18(ptr %p) nounwind {
; MIPS64-NEXT: dsrl $4, $2, 36
; MIPS64-NEXT: dsrl $6, $1, 8
; MIPS64-NEXT: dsrl $5, $1, 26
-; MIPS64-NEXT: lui $7, 3
-; MIPS64-NEXT: ori $7, $7, 65535
-; MIPS64-NEXT: and $10, $2, $7
-; MIPS64-NEXT: and $5, $5, $7
-; MIPS64-NEXT: and $6, $6, $7
-; MIPS64-NEXT: and $8, $4, $7
-; MIPS64-NEXT: and $9, $3, $7
+; MIPS64-NEXT: lui $17, 3
+; MIPS64-NEXT: ori $18, $17, 65535
+; MIPS64-NEXT: and $10, $2, $18
+; MIPS64-NEXT: and $5, $5, $18
+; MIPS64-NEXT: and $6, $6, $18
+; MIPS64-NEXT: and $8, $4, $18
+; MIPS64-NEXT: and $9, $3, $18
; MIPS64-NEXT: dsll $3, $1, 10
; MIPS64-NEXT: dsrl $2, $2, 54
; MIPS64-NEXT: or $2, $2, $3
-; MIPS64-NEXT: and $7, $2, $7
+; MIPS64-NEXT: and $7, $2, $18
; MIPS64-NEXT: jal arg_v7i18
; MIPS64-NEXT: dsrl $4, $1, 44
; MIPS64-NEXT: jal ret_v7i18
; MIPS64-NEXT: daddiu $4, $sp, 0
-; MIPS64-NEXT: ld $1, 0($sp)
+; MIPS64-NEXT: ori $1, $17, 64512
+; MIPS64-NEXT: ld $2, 0($sp)
+; MIPS64-NEXT: dsrl $3, $2, 26
+; MIPS64-NEXT: and $3, $3, $18
+; MIPS64-NEXT: dsll $3, $3, 26
+; MIPS64-NEXT: dsrl $4, $2, 44
+; MIPS64-NEXT: dsll $4, $4, 44
+; MIPS64-NEXT: or $3, $4, $3
+; MIPS64-NEXT: dsrl $4, $2, 8
+; MIPS64-NEXT: and $4, $4, $18
+; MIPS64-NEXT: dsll $4, $4, 8
+; MIPS64-NEXT: or $3, $3, $4
+; MIPS64-NEXT: dsll $2, $2, 10
+; MIPS64-NEXT: and $1, $2, $1
+; MIPS64-NEXT: dsrl $1, $1, 10
+; MIPS64-NEXT: or $1, $3, $1
+; MIPS64-NEXT: ld $2, 8($sp)
+; MIPS64-NEXT: dsrl $3, $2, 36
+; MIPS64-NEXT: and $3, $3, $18
+; MIPS64-NEXT: dsll $3, $3, 36
+; MIPS64-NEXT: dsrl $4, $2, 54
+; MIPS64-NEXT: dsll $4, $4, 54
; MIPS64-NEXT: sd $1, 0($16)
-; MIPS64-NEXT: ld $1, 8($sp)
+; MIPS64-NEXT: or $1, $4, $3
+; MIPS64-NEXT: dsrl $3, $2, 18
+; MIPS64-NEXT: and $3, $3, $18
+; MIPS64-NEXT: dsll $3, $3, 18
+; MIPS64-NEXT: or $1, $1, $3
+; MIPS64-NEXT: and $2, $2, $18
+; MIPS64-NEXT: or $1, $1, $2
; MIPS64-NEXT: sd $1, 8($16)
; MIPS64-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
-; MIPS64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT: ld $17, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT: ld $18, 32($sp) # 8-byte Folded Reload
+; MIPS64-NEXT: ld $ra, 40($sp) # 8-byte Folded Reload
; MIPS64-NEXT: jr $ra
-; MIPS64-NEXT: daddiu $sp, $sp, 32
+; MIPS64-NEXT: daddiu $sp, $sp, 48
;
; MIPS32-LABEL: call_v7i18:
; MIPS32: # %bb.0:
-; MIPS32-NEXT: addiu $sp, $sp, -64
-; MIPS32-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill
-; MIPS32-NEXT: sw $16, 52($sp) # 4-byte Folded Spill
+; MIPS32-NEXT: addiu $sp, $sp, -80
+; MIPS32-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill
+; MIPS32-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill
+; MIPS32-NEXT: sw $18, 68($sp) # 4-byte Folded Spill
+; MIPS32-NEXT: sw $17, 64($sp) # 4-byte Folded Spill
+; MIPS32-NEXT: sw $16, 60($sp) # 4-byte Folded Spill
; MIPS32-NEXT: move $fp, $sp
; MIPS32-NEXT: addiu $1, $zero, -16
; MIPS32-NEXT: and $sp, $sp, $1
@@ -1415,44 +1504,79 @@ define void @call_v7i18(ptr %p) nounwind {
; MIPS32-NEXT: srl $4, $3, 18
; MIPS32-NEXT: or $2, $4, $2
; MIPS32-NEXT: srl $4, $1, 4
-; MIPS32-NEXT: lui $5, 3
-; MIPS32-NEXT: ori $7, $5, 65535
-; MIPS32-NEXT: and $2, $2, $7
-; MIPS32-NEXT: and $4, $4, $7
-; MIPS32-NEXT: and $3, $3, $7
-; MIPS32-NEXT: lw $8, 4($16)
-; MIPS32-NEXT: lw $9, 0($16)
-; MIPS32-NEXT: sll $5, $9, 6
-; MIPS32-NEXT: srl $6, $8, 26
+; MIPS32-NEXT: lui $18, 3
+; MIPS32-NEXT: ori $17, $18, 65535
+; MIPS32-NEXT: and $2, $2, $17
+; MIPS32-NEXT: and $4, $4, $17
+; MIPS32-NEXT: and $3, $3, $17
+; MIPS32-NEXT: lw $7, 4($16)
+; MIPS32-NEXT: lw $8, 0($16)
+; MIPS32-NEXT: sll $5, $8, 6
+; MIPS32-NEXT: srl $6, $7, 26
; MIPS32-NEXT: sw $3, 24($sp)
; MIPS32-NEXT: sw $4, 16($sp)
; MIPS32-NEXT: sw $2, 20($sp)
; MIPS32-NEXT: or $2, $6, $5
-; MIPS32-NEXT: srl $3, $8, 8
-; MIPS32-NEXT: and $6, $3, $7
-; MIPS32-NEXT: and $5, $2, $7
-; MIPS32-NEXT: sll $2, $8, 10
+; MIPS32-NEXT: srl $3, $7, 8
+; MIPS32-NEXT: and $6, $3, $17
+; MIPS32-NEXT: and $5, $2, $17
+; MIPS32-NEXT: sll $2, $7, 10
; MIPS32-NEXT: srl $1, $1, 22
; MIPS32-NEXT: or $1, $1, $2
-; MIPS32-NEXT: and $7, $1, $7
+; MIPS32-NEXT: and $7, $1, $17
; MIPS32-NEXT: jal arg_v7i18
-; MIPS32-NEXT: srl $4, $9, 12
+; MIPS32-NEXT: srl $4, $8, 12
; MIPS32-NEXT: jal ret_v7i18
; MIPS32-NEXT: addiu $4, $sp, 32
-; MIPS32-NEXT: lw $1, 32($sp)
-; MIPS32-NEXT: sw $1, 0($16)
-; MIPS32-NEXT: lw $1, 36($sp)
+; MIPS32-NEXT: ori $1, $18, 64512
+; MIPS32-NEXT: ori $2, $18, 65472
+; MIPS32-NEXT: ori $3, $18, 49152
+; MIPS32-NEXT: lw $4, 36($sp)
+; MIPS32-NEXT: srl $5, $4, 8
+; MIPS32-NEXT: and $5, $5, $17
+; MIPS32-NEXT: sll $5, $5, 8
+; MIPS32-NEXT: lw $6, 32($sp)
+; MIPS32-NEXT: sll $7, $6, 6
+; MIPS32-NEXT: and $2, $7, $2
+; MIPS32-NEXT: srl $7, $4, 26
+; MIPS32-NEXT: sll $7, $7, 26
+; MIPS32-NEXT: srl $2, $2, 6
+; MIPS32-NEXT: or $5, $7, $5
+; MIPS32-NEXT: sll $4, $4, 10
+; MIPS32-NEXT: and $1, $4, $1
+; MIPS32-NEXT: srl $1, $1, 10
+; MIPS32-NEXT: srl $4, $6, 12
+; MIPS32-NEXT: sll $4, $4, 12
+; MIPS32-NEXT: lw $6, 40($sp)
+; MIPS32-NEXT: srl $7, $6, 4
+; MIPS32-NEXT: or $1, $5, $1
+; MIPS32-NEXT: and $5, $7, $17
+; MIPS32-NEXT: or $2, $4, $2
+; MIPS32-NEXT: lw $4, 44($sp)
+; MIPS32-NEXT: and $7, $4, $17
+; MIPS32-NEXT: srl $4, $4, 18
+; MIPS32-NEXT: sll $4, $4, 18
+; MIPS32-NEXT: or $4, $4, $7
+; MIPS32-NEXT: sw $4, 12($16)
+; MIPS32-NEXT: sw $2, 0($16)
; MIPS32-NEXT: sw $1, 4($16)
-; MIPS32-NEXT: lw $1, 40($sp)
+; MIPS32-NEXT: sll $1, $5, 4
+; MIPS32-NEXT: srl $2, $6, 22
+; MIPS32-NEXT: sll $2, $2, 22
+; MIPS32-NEXT: or $1, $2, $1
+; MIPS32-NEXT: sll $2, $6, 14
+; MIPS32-NEXT: and $2, $2, $3
+; MIPS32-NEXT: srl $2, $2, 14
+; MIPS32-NEXT: or $1, $1, $2
; MIPS32-NEXT: sw $1, 8($16)
-; MIPS32-NEXT: lw $1, 44($sp)
-; MIPS32-NEXT: sw $1, 12($16)
; MIPS32-NEXT: move $sp, $fp
-; MIPS32-NEXT: lw $16, 52($sp) # 4-byte Folded Reload
-; MIPS32-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload
+; MIPS32-NEXT: lw $16, 60($sp) # 4-byte Folded Reload
+; MIPS32-NEXT: lw $17, 64($sp) # 4-byte Folded Reload
+; MIPS32-NEXT: lw $18, 68($sp) # 4-byte Folded Reload
+; MIPS32-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload
+; MIPS32-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload
; MIPS32-NEXT: jr $ra
-; MIPS32-NEXT: addiu $sp, $sp, 64
+; MIPS32-NEXT: addiu $sp, $sp, 80
%v1 = load <7 x i18>, ptr %p
call void @arg_v7i18(<7 x i18> %v1)
%v2 = call <7 x i18> @ret_v7i18()
diff --git a/llvm/test/CodeGen/Mips/cins.ll b/llvm/test/CodeGen/Mips/cins.ll
index d00138a3ce37a7..d1f4322965a775 100644
--- a/llvm/test/CodeGen/Mips/cins.ll
+++ b/llvm/test/CodeGen/Mips/cins.ll
@@ -97,8 +97,9 @@ entry:
define i64 @cins_shl_and32(i32 signext %n) {
; CHECK-LABEL: cins_shl_and32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andi $1, $4, 65535
; CHECK-NEXT: jr $ra
-; CHECK-NEXT: cins32 $2, $4, 15, 15
+; CHECK-NEXT: dsll $2, $1, 47
entry:
%and = and i32 %n, 65535
%conv = zext i32 %and to i64
diff --git a/llvm/test/CodeGen/Mips/dins.ll b/llvm/test/CodeGen/Mips/dins.ll
index 4deb7455a80128..e90611488b006a 100644
--- a/llvm/test/CodeGen/Mips/dins.ll
+++ b/llvm/test/CodeGen/Mips/dins.ll
@@ -45,13 +45,17 @@ define i64 @f123(i64 inreg %bufptr.coerce0, i64 inreg %bufptr.coerce1) local_unn
; MIPS64R2-NEXT: daddiu $1, $zero, 4
; MIPS64R2-NEXT: ld $2, 0($sp)
; MIPS64R2-NEXT: dinsm $2, $1, 28, 6
-; MIPS64R2-NEXT: daddiu $1, $zero, 5
+; MIPS64R2-NEXT: daddiu $1, $zero, 16383
+; MIPS64R2-NEXT: dsll $1, $1, 34
+; MIPS64R2-NEXT: daddiu $3, $zero, 5
; MIPS64R2-NEXT: sd $2, 0($sp)
; MIPS64R2-NEXT: ld $2, 0($sp)
-; MIPS64R2-NEXT: dinsu $2, $1, 50, 14
+; MIPS64R2-NEXT: dinsu $2, $3, 50, 14
; MIPS64R2-NEXT: sd $2, 0($sp)
-; MIPS64R2-NEXT: ld $1, 0($sp)
-; MIPS64R2-NEXT: dsrl $1, $1, 50
+; MIPS64R2-NEXT: ld $2, 0($sp)
+; MIPS64R2-NEXT: dsrl $2, $2, 16
+; MIPS64R2-NEXT: and $1, $2, $1
+; MIPS64R2-NEXT: dsrl $1, $1, 34
; MIPS64R2-NEXT: ld $2, 0($sp)
; MIPS64R2-NEXT: dinsu $2, $1, 34, 16
; MIPS64R2-NEXT: sd $2, 0($sp)
@@ -93,7 +97,9 @@ define i64 @f123(i64 inreg %bufptr.coerce0, i64 inreg %bufptr.coerce1) local_unn
; MIPS32R2-NEXT: lw $2, 0($sp)
; MIPS32R2-NEXT: lw $3, 4($sp)
; MIPS32R2-NEXT: sw $3, 4($sp)
-; MIPS32R2-NEXT: srl $1, $1, 18
+; MIPS32R2-NEXT: srl $1, $1, 16
+; MIPS32R2-NEXT: andi $1, $1, 65532
+; MIPS32R2-NEXT: srl $1, $1, 2
; MIPS32R2-NEXT: ins $2, $1, 2, 16
; MIPS32R2-NEXT: sw $2, 0($sp)
; MIPS32R2-NEXT: lw $1, 8($sp)
@@ -189,13 +195,17 @@ define i64 @f123(i64 inreg %bufptr.coerce0, i64 inreg %bufptr.coerce1) local_unn
; MIPS64R2N32-NEXT: daddiu $1, $zero, 4
; MIPS64R2N32-NEXT: ld $2, 0($sp)
; MIPS64R2N32-NEXT: dinsm $2, $1, 28, 6
-; MIPS64R2N32-NEXT: daddiu $1, $zero, 5
+; MIPS64R2N32-NEXT: daddiu $1, $zero, 16383
+; MIPS64R2N32-NEXT: dsll $1, $1, 34
+; MIPS64R2N32-NEXT: daddiu $3, $zero, 5
; MIPS64R2N32-NEXT: sd $2, 0($sp)
; MIPS64R2N32-NEXT: ld $2, 0($sp)
-; MIPS64R2N32-NEXT: dinsu $2, $1, 50, 14
+; MIPS64R2N32-NEXT: dinsu $2, $3, 50, 14
; MIPS64R2N32-NEXT: sd $2, 0($sp)
-; MIPS64R2N32-NEXT: ld $1, 0($sp)
-; MIPS64R2N32-NEXT: dsrl $1, $1, 50
+; MIPS64R2N32-NEXT: ld $2, 0($sp)
+; MIPS64R2N32-NEXT: dsrl $2, $2, 16
+; MIPS64R2N32-NEXT: and $1, $2, $1
+; MIPS64R2N32-NEXT: dsrl $1, $1, 34
; MIPS64R2N32-NEXT: ld $2, 0($sp)
; MIPS64R2N32-NEXT: dinsu $2, $1, 34, 16
; MIPS64R2N32-NEXT: sd $2, 0($sp)
diff --git a/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll b/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
index cc2c674f89586b..afa987773e22b9 100644
--- a/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
+++ b/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
@@ -22,10 +22,9 @@ define float @func2(float %d, double %f) nounwind readnone {
; 64-NEXT: add.s $f0, $f12, $f0
; 64-NEXT: mfc1 $1, $f0
; 64-NEXT: dmfc1 $2, $f13
-; 64-NEXT: lui $3, 32767
-; 64-NEXT: ori $3, $3, 65535
-; 64-NEXT: and $1, $1, $3
+; 64-NEXT: sll $1, $1, 1
; 64-NEXT: dsrl $2, $2, 63
+; 64-NEXT: srl $1, $1, 1
; 64-NEXT: sll $2, $2, 0
; 64-NEXT: sll $2, $2, 31
; 64-NEXT: or $1, $1, $2
@@ -61,21 +60,19 @@ entry:
define double @func3(double %d, float %f) nounwind readnone {
; 64-LABEL: func3:
; 64: # %bb.0: # %entry
-; 64-NEXT: lui $1, %highest(.LCPI1_0)
-; 64-NEXT: daddiu $1, $1, %higher(.LCPI1_0)
-; 64-NEXT: dsll $1, $1, 16
-; 64-NEXT: daddiu $1, $1, %hi(.LCPI1_0)
-; 64-NEXT: dsll $1, $1, 16
-; 64-NEXT: ldc1 $f0, %lo(.LCPI1_0)($1)
-; 64-NEXT: add.d $f0, $f12, $f0
; 64-NEXT: mfc1 $1, $f13
-; 64-NEXT: daddiu $2, $zero, 1
-; 64-NEXT: dmfc1 $3, $f0
-; 64-NEXT: dsll $2, $2, 63
-; 64-NEXT: daddiu $2, $2, -1
-; 64-NEXT: and $2, $3, $2
; 64-NEXT: srl $1, $1, 31
+; 64-NEXT: lui $2, %highest(.LCPI1_0)
+; 64-NEXT: daddiu $2, $2, %higher(.LCPI1_0)
+; 64-NEXT: dsll $2, $2, 16
+; 64-NEXT: daddiu $2, $2, %hi(.LCPI1_0)
; 64-NEXT: dsll $1, $1, 63
+; 64-NEXT: dsll $2, $2, 16
+; 64-NEXT: ldc1 $f0, %lo(.LCPI1_0)($2)
+; 64-NEXT: add.d $f0, $f12, $f0
+; 64-NEXT: dmfc1 $2, $f0
+; 64-NEXT: dsll $2, $2, 1
+; 64-NEXT: dsrl $2, $2, 1
; 64-NEXT: or $1, $2, $1
; 64-NEXT: jr $ra
; 64-NEXT: dmtc1 $1, $f0
diff --git a/llvm/test/CodeGen/Mips/fcopysign.ll b/llvm/test/CodeGen/Mips/fcopysign.ll
index 167354aaf085a4..409044c77bb95b 100644
--- a/llvm/test/CodeGen/Mips/fcopysign.ll
+++ b/llvm/test/CodeGen/Mips/fcopysign.ll
@@ -14,12 +14,11 @@ define double @func0(double %d0, double %d1) nounwind readnone {
; 32-LABEL: func0:
; 32: # %bb.0: # %entry
; 32-NEXT: mfc1 $1, $f15
-; 32-NEXT: lui $2, 32768
-; 32-NEXT: and $1, $1, $2
-; 32-NEXT: lui $2, 32767
-; 32-NEXT: ori $2, $2, 65535
-; 32-NEXT: mfc1 $3, $f13
-; 32-NEXT: and $2, $3, $2
+; 32-NEXT: srl $1, $1, 31
+; 32-NEXT: sll $1, $1, 31
+; 32-NEXT: mfc1 $2, $f13
+; 32-NEXT: sll $2, $2, 1
+; 32-NEXT: srl $2, $2, 1
; 32-NEXT: or $1, $2, $1
; 32-NEXT: mfc1 $2, $f12
; 32-NEXT: mtc1 $2, $f0
@@ -40,14 +39,13 @@ define double @func0(double %d0, double %d1) nounwind readnone {
;
; 64-LABEL: func0:
; 64: # %bb.0: # %entry
-; 64-NEXT: daddiu $1, $zero, 1
+; 64-NEXT: dmfc1 $1, $f13
+; 64-NEXT: dsrl $1, $1, 63
; 64-NEXT: dsll $1, $1, 63
-; 64-NEXT: dmfc1 $2, $f13
-; 64-NEXT: and $2, $2, $1
-; 64-NEXT: dmfc1 $3, $f12
-; 64-NEXT: daddiu $1, $1, -1
-; 64-NEXT: and $1, $3, $1
-; 64-NEXT: or $1, $1, $2
+; 64-NEXT: dmfc1 $2, $f12
+; 64-NEXT: dsll $2, $2, 1
+; 64-NEXT: dsrl $2, $2, 1
+; 64-NEXT: or $1, $2, $1
; 64-NEXT: jr $ra
; 64-NEXT: dmtc1 $1, $f0
;
@@ -74,12 +72,11 @@ define float @func1(float %f0, float %f1) nounwind readnone {
; 32-LABEL: func1:
; 32: # %bb.0: # %entry
; 32-NEXT: mfc1 $1, $f14
-; 32-NEXT: lui $2, 32768
-; 32-NEXT: and $1, $1, $2
-; 32-NEXT: lui $2, 32767
-; 32-NEXT: ori $2, $2, 65535
-; 32-NEXT: mfc1 $3, $f12
-; 32-NEXT: and $2, $3, $2
+; 32-NEXT: srl $1, $1, 31
+; 32-NEXT: sll $1, $1, 31
+; 32-NEXT: mfc1 $2, $f12
+; 32-NEXT: sll $2, $2, 1
+; 32-NEXT: srl $2, $2, 1
; 32-NEXT: or $1, $2, $1
; 32-NEXT: jr $ra
; 32-NEXT: mtc1 $1, $f0
@@ -96,12 +93,11 @@ define float @func1(float %f0, float %f1) nounwind readnone {
; 64-LABEL: func1:
; 64: # %bb.0: # %entry
; 64-NEXT: mfc1 $1, $f13
-; 64-NEXT: lui $2, 32768
-; 64-NEXT: and $1, $1, $2
-; 64-NEXT: lui $2, 32767
-; 64-NEXT: ori $2, $2, 65535
-; 64-NEXT: mfc1 $3, $f12
-; 64-NEXT: and $2, $3, $2
+; 64-NEXT: srl $1, $1, 31
+; 64-NEXT: sll $1, $1, 31
+; 64-NEXT: mfc1 $2, $f12
+; 64-NEXT: sll $2, $2, 1
+; 64-NEXT: srl $2, $2, 1
; 64-NEXT: or $1, $2, $1
; 64-NEXT: jr $ra
; 64-NEXT: mtc1 $1, $f0
diff --git a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll
index ee187678949e95..d4194384c95acc 100644
--- a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll
@@ -74,8 +74,8 @@ define i32 @rotl_i32(i32 %x, i32 %z) {
define i64 @rotl_i64(i64 %x, i64 %z) {
; CHECK-BE-LABEL: rotl_i64:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: srl $1, $7, 5
-; CHECK-BE-NEXT: andi $1, $1, 1
+; CHECK-BE-NEXT: andi $1, $7, 32
+; CHECK-BE-NEXT: srl $1, $1, 5
; CHECK-BE-NEXT: move $3, $4
; CHECK-BE-NEXT: movn $3, $5, $1
; CHECK-BE-NEXT: sllv $2, $3, $7
@@ -92,8 +92,8 @@ define i64 @rotl_i64(i64 %x, i64 %z) {
;
; CHECK-LE-LABEL: rotl_i64:
; CHECK-LE: # %bb.0:
-; CHECK-LE-NEXT: srl $1, $6, 5
-; CHECK-LE-NEXT: andi $1, $1, 1
+; CHECK-LE-NEXT: andi $1, $6, 32
+; CHECK-LE-NEXT: srl $1, $1, 5
; CHECK-LE-NEXT: move $3, $4
; CHECK-LE-NEXT: movn $3, $5, $1
; CHECK-LE-NEXT: sllv $2, $3, $6
diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll
index bda2b477b52f30..361235faff3717 100644
--- a/llvm/test/CodeGen/Mips/funnel-shift.ll
+++ b/llvm/test/CodeGen/Mips/funnel-shift.ll
@@ -70,8 +70,8 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-NEXT: addiu $6, $zero, 0
; CHECK-BE-NEXT: jal __umoddi3
; CHECK-BE-NEXT: addiu $7, $zero, 37
-; CHECK-BE-NEXT: srl $1, $3, 5
-; CHECK-BE-NEXT: andi $1, $1, 1
+; CHECK-BE-NEXT: andi $1, $3, 32
+; CHECK-BE-NEXT: srl $1, $1, 5
; CHECK-BE-NEXT: movn $19, $18, $1
; CHECK-BE-NEXT: sllv $2, $19, $3
; CHECK-BE-NEXT: not $4, $3
@@ -120,8 +120,8 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LE-NEXT: addiu $6, $zero, 37
; CHECK-LE-NEXT: jal __umoddi3
; CHECK-LE-NEXT: addiu $7, $zero, 0
-; CHECK-LE-NEXT: srl $1, $2, 5
-; CHECK-LE-NEXT: andi $3, $1, 1
+; CHECK-LE-NEXT: andi $1, $2, 32
+; CHECK-LE-NEXT: srl $3, $1, 5
; CHECK-LE-NEXT: srl $1, $17, 5
; CHECK-LE-NEXT: sll $4, $16, 27
; CHECK-LE-NEXT: or $1, $4, $1
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/abs.ll b/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
index ea0e34fb2b0856..9120390d581993 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
@@ -8,57 +8,54 @@
define float @abs_s(float %a) {
; MIPS32-LABEL: abs_s:
; MIPS32: # %bb.0:
-; MIPS32-NEXT: lui $1, 32767 # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT: mfc1 $1, $f12 # <MCInst #[[#MCINST1:]] MFC1
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1:]]>
-; MIPS32-NEXT: # <MCOperand Imm:32767>>
-; MIPS32-NEXT: ori $1, $1, 65535 # <MCInst #[[#MCINST2:]] ORi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32-NEXT: sll $1, $1, 1 # <MCInst #[[#MCINST2:]] SLL
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
-; MIPS32-NEXT: # <MCOperand Imm:65535>>
-; MIPS32-NEXT: mfc1 $2, $f12 # <MCInst #[[#MCINST3:]] MFC1
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2:]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
-; MIPS32-NEXT: and $1, $2, $1 # <MCInst #[[#MCINST4:]] AND
+; MIPS32-NEXT: # <MCOperand Imm:1>>
+; MIPS32-NEXT: srl $1, $1, 1 # <MCInst #[[#MCINST3:]] SRL
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
-; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST5:]] JR
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
-; MIPS32-NEXT: mtc1 $1, $f0 # <MCInst #[[#MCINST6:]] MTC1
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:1>>
+; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST4:]] JR
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MIPS32-NEXT: mtc1 $1, $f0 # <MCInst #[[#MCINST5:]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4:]]>
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
;
; MIPS32FP64-LABEL: abs_s:
; MIPS32FP64: # %bb.0:
-; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST5:]] JR
-; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
-; MIPS32FP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST7:]] FABS_S
-; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST4:]] JR
; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MIPS32FP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST6:]] FABS_S
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG2:]]>>
;
; MM-LABEL: abs_s:
; MM: # %bb.0:
-; MM-NEXT: jr $ra # <MCInst #[[#MCINST8:]] JR_MM
-; MM-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
-; MM-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
-; MM-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MM-NEXT: jr $ra # <MCInst #[[#MCINST7:]] JR_MM
; MM-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MM-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST8:]] FABS_S_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG4:]]>
+; MM-NEXT: # <MCOperand Reg:[[#MCREG2:]]>>
;
; MMFP64-LABEL: abs_s:
; MMFP64: # %bb.0:
-; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST8:]] JR_MM
-; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
-; MMFP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
-; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST7:]] JR_MM
; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MMFP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST8:]] FABS_S_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG2:]]>>
;
; MMR6-LABEL: abs_s:
; MMR6: # %bb.0:
-; MMR6-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
-; MMR6-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MMR6-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST8:]] FABS_S_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4:]]>
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST9:]] JRC16_MM
; MMR6-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
-; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST10:]] JRC16_MM
-; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
%ret = call float @llvm.fabs.f32(float %a)
ret float %ret
}
@@ -66,63 +63,60 @@ define float @abs_s(float %a) {
define double @abs_d(double %a) {
; MIPS32-LABEL: abs_d:
; MIPS32: # %bb.0:
-; MIPS32-NEXT: lui $1, 32767 # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT: mfc1 $1, $f12 # <MCInst #[[#MCINST1]] MFC1
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
-; MIPS32-NEXT: # <MCOperand Imm:32767>>
-; MIPS32-NEXT: ori $1, $1, 65535 # <MCInst #[[#MCINST2]] ORi
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
-; MIPS32-NEXT: # <MCOperand Imm:65535>>
-; MIPS32-NEXT: mfc1 $2, $f13 # <MCInst #[[#MCINST3]] MFC1
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT: mfc1 $2, $f13 # <MCInst #[[#MCINST1]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG6:]]>>
-; MIPS32-NEXT: and $1, $2, $1 # <MCInst #[[#MCINST4]] AND
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: sll $2, $2, 1 # <MCInst #[[#MCINST2]] SLL
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
+; MIPS32-NEXT: # <MCOperand Imm:1>>
+; MIPS32-NEXT: srl $2, $2, 1 # <MCInst #[[#MCINST3]] SRL
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
+; MIPS32-NEXT: # <MCOperand Imm:1>>
+; MIPS32-NEXT: mtc1 $1, $f0 # <MCInst #[[#MCINST5]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4]]>
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
-; MIPS32-NEXT: mfc1 $2, $f12 # <MCInst #[[#MCINST3]] MFC1
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST4]] JR
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
-; MIPS32-NEXT: mtc1 $2, $f0 # <MCInst #[[#MCINST6]] MTC1
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>>
-; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST5]] JR
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
-; MIPS32-NEXT: mtc1 $1, $f1 # <MCInst #[[#MCINST6]] MTC1
+; MIPS32-NEXT: mtc1 $2, $f1 # <MCInst #[[#MCINST5]] MTC1
; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG7:]]>
-; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>>
;
; MIPS32FP64-LABEL: abs_d:
; MIPS32FP64: # %bb.0:
-; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST5]] JR
-; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
-; MIPS32FP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST11:]] FABS_D64
+; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST4]] JR
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
+; MIPS32FP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST10:]] FABS_D64
; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
;
; MM-LABEL: abs_d:
; MM: # %bb.0:
-; MM-NEXT: jr $ra # <MCInst #[[#MCINST8]] JR_MM
-; MM-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
-; MM-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST12:]] FABS_D32_MM
+; MM-NEXT: jr $ra # <MCInst #[[#MCINST7]] JR_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
+; MM-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST11:]] FABS_D32_MM
; MM-NEXT: # <MCOperand Reg:[[#MCREG10:]]>
; MM-NEXT: # <MCOperand Reg:[[#MCREG11:]]>>
;
; MMFP64-LABEL: abs_d:
; MMFP64: # %bb.0:
-; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST8]] JR_MM
-; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
-; MMFP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST13:]] FABS_D64_MM
+; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST7]] JR_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
+; MMFP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST12:]] FABS_D64_MM
; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
;
; MMR6-LABEL: abs_d:
; MMR6: # %bb.0:
-; MMR6-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST13:]] FABS_D64_MM
+; MMR6-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST12:]] FABS_D64_MM
; MMR6-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
; MMR6-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
-; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST10]] JRC16_MM
-; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST9]] JRC16_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
%ret = call double @llvm.fabs.f64(double %a)
ret double %ret
}
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/nan-fp-attr.ll b/llvm/test/CodeGen/Mips/llvm-ir/nan-fp-attr.ll
index 918cda6b38c265..02064f822aa4e1 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/nan-fp-attr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/nan-fp-attr.ll
@@ -96,14 +96,13 @@ entry:
define dso_local double @bar(double %a) {
; MIPS32R1-LABEL: bar:
; MIPS32R1: # %bb.0: # %entry
-; MIPS32R1-NEXT: lui $1, 32767
-; MIPS32R1-NEXT: ori $1, $1, 65535
+; MIPS32R1-NEXT: mfc1 $1, $f12
; MIPS32R1-NEXT: mfc1 $2, $f13
-; MIPS32R1-NEXT: and $1, $2, $1
-; MIPS32R1-NEXT: mfc1 $2, $f12
-; MIPS32R1-NEXT: mtc1 $2, $f0
+; MIPS32R1-NEXT: sll $2, $2, 1
+; MIPS32R1-NEXT: srl $2, $2, 1
+; MIPS32R1-NEXT: mtc1 $1, $f0
; MIPS32R1-NEXT: jr $ra
-; MIPS32R1-NEXT: mtc1 $1, $f1
+; MIPS32R1-NEXT: mtc1 $2, $f1
;
; MIPS32R2-LABEL: bar:
; MIPS32R2: # %bb.0: # %entry
@@ -128,10 +127,8 @@ define dso_local double @bar(double %a) {
; MIPS64R1-LABEL: bar:
; MIPS64R1: # %bb.0: # %entry
; MIPS64R1-NEXT: dmfc1 $1, $f12
-; MIPS64R1-NEXT: daddiu $2, $zero, 1
-; MIPS64R1-NEXT: dsll $2, $2, 63
-; MIPS64R1-NEXT: daddiu $2, $2, -1
-; MIPS64R1-NEXT: and $1, $1, $2
+; MIPS64R1-NEXT: dsll $1, $1, 1
+; MIPS64R1-NEXT: dsrl $1, $1, 1
; MIPS64R1-NEXT: jr $ra
; MIPS64R1-NEXT: dmtc1 $1, $f0
;
@@ -248,10 +245,9 @@ entry:
define dso_local float @bar_2(float %a) {
; MIPS32R1-LABEL: bar_2:
; MIPS32R1: # %bb.0: # %entry
-; MIPS32R1-NEXT: lui $1, 32767
-; MIPS32R1-NEXT: ori $1, $1, 65535
-; MIPS32R1-NEXT: mfc1 $2, $f12
-; MIPS32R1-NEXT: and $1, $2, $1
+; MIPS32R1-NEXT: mfc1 $1, $f12
+; MIPS32R1-NEXT: sll $1, $1, 1
+; MIPS32R1-NEXT: srl $1, $1, 1
; MIPS32R1-NEXT: jr $ra
; MIPS32R1-NEXT: mtc1 $1, $f0
;
@@ -274,10 +270,9 @@ define dso_local float @bar_2(float %a) {
;
; MIPS64R1-LABEL: bar_2:
; MIPS64R1: # %bb.0: # %entry
-; MIPS64R1-NEXT: lui $1, 32767
-; MIPS64R1-NEXT: ori $1, $1, 65535
-; MIPS64R1-NEXT: mfc1 $2, $f12
-; MIPS64R1-NEXT: and $1, $2, $1
+; MIPS64R1-NEXT: mfc1 $1, $f12
+; MIPS64R1-NEXT: sll $1, $1, 1
+; MIPS64R1-NEXT: srl $1, $1, 1
; MIPS64R1-NEXT: jr $ra
; MIPS64R1-NEXT: mtc1 $1, $f0
;
diff --git a/llvm/test/CodeGen/Mips/load-store-left-right.ll b/llvm/test/CodeGen/Mips/load-store-left-right.ll
index 3c3110341df269..e6b8842657d1bd 100644
--- a/llvm/test/CodeGen/Mips/load-store-left-right.ll
+++ b/llvm/test/CodeGen/Mips/load-store-left-right.ll
@@ -424,11 +424,9 @@ define i64 @load_UI() nounwind readonly {
; MIPS64-EL-NEXT: ld $1, %got_disp(sui)($1)
; MIPS64-EL-NEXT: lwl $2, 3($1)
; MIPS64-EL-NEXT: lwr $2, 0($1)
-; MIPS64-EL-NEXT: daddiu $1, $zero, 1
-; MIPS64-EL-NEXT: dsll $1, $1, 32
-; MIPS64-EL-NEXT: daddiu $1, $1, -1
+; MIPS64-EL-NEXT: dsll $1, $2, 32
; MIPS64-EL-NEXT: jr $ra
-; MIPS64-EL-NEXT: and $2, $2, $1
+; MIPS64-EL-NEXT: dsrl $2, $1, 32
;
; MIPS64-EB-LABEL: load_UI:
; MIPS64-EB: # %bb.0: # %entry
@@ -438,11 +436,9 @@ define i64 @load_UI() nounwind readonly {
; MIPS64-EB-NEXT: ld $1, %got_disp(sui)($1)
; MIPS64-EB-NEXT: lwl $2, 0($1)
; MIPS64-EB-NEXT: lwr $2, 3($1)
-; MIPS64-EB-NEXT: daddiu $1, $zero, 1
-; MIPS64-EB-NEXT: dsll $1, $1, 32
-; MIPS64-EB-NEXT: daddiu $1, $1, -1
+; MIPS64-EB-NEXT: dsll $1, $2, 32
; MIPS64-EB-NEXT: jr $ra
-; MIPS64-EB-NEXT: and $2, $2, $1
+; MIPS64-EB-NEXT: dsrl $2, $1, 32
;
; MIPS64R2-EL-LABEL: load_UI:
; MIPS64R2-EL: # %bb.0: # %entry
@@ -452,8 +448,9 @@ define i64 @load_UI() nounwind readonly {
; MIPS64R2-EL-NEXT: ld $1, %got_disp(sui)($1)
; MIPS64R2-EL-NEXT: lwl $2, 3($1)
; MIPS64R2-EL-NEXT: lwr $2, 0($1)
+; MIPS64R2-EL-NEXT: dsll $1, $2, 32
; MIPS64R2-EL-NEXT: jr $ra
-; MIPS64R2-EL-NEXT: dext $2, $2, 0, 32
+; MIPS64R2-EL-NEXT: dsrl $2, $1, 32
;
; MIPS64R2-EB-LABEL: load_UI:
; MIPS64R2-EB: # %bb.0: # %entry
@@ -463,8 +460,9 @@ define i64 @load_UI() nounwind readonly {
; MIPS64R2-EB-NEXT: ld $1, %got_disp(sui)($1)
; MIPS64R2-EB-NEXT: lwl $2, 0($1)
; MIPS64R2-EB-NEXT: lwr $2, 3($1)
+; MIPS64R2-EB-NEXT: dsll $1, $2, 32
; MIPS64R2-EB-NEXT: jr $ra
-; MIPS64R2-EB-NEXT: dext $2, $2, 0, 32
+; MIPS64R2-EB-NEXT: dsrl $2, $1, 32
;
; MIPS64R6-LABEL: load_UI:
; MIPS64R6: # %bb.0: # %entry
@@ -961,12 +959,12 @@ define void @pass_array_byval() nounwind {
; MIPS32-EB-NEXT: addu $gp, $2, $25
; MIPS32-EB-NEXT: lw $1, %got(arr)($gp)
; MIPS32-EB-NEXT: lwl $4, 0($1)
-; MIPS32-EB-NEXT: lbu $2, 5($1)
; MIPS32-EB-NEXT: lwr $4, 3($1)
-; MIPS32-EB-NEXT: sll $2, $2, 16
+; MIPS32-EB-NEXT: lbu $2, 5($1)
; MIPS32-EB-NEXT: lbu $3, 4($1)
-; MIPS32-EB-NEXT: sll $3, $3, 24
+; MIPS32-EB-NEXT: sll $3, $3, 8
; MIPS32-EB-NEXT: or $2, $3, $2
+; MIPS32-EB-NEXT: sll $2, $2, 16
; MIPS32-EB-NEXT: lbu $1, 6($1)
; MIPS32-EB-NEXT: sll $1, $1, 8
; MIPS32-EB-NEXT: lw $25, %call16(extern_func)($gp)
@@ -1031,16 +1029,14 @@ define void @pass_array_byval() nounwind {
; MIPS64-EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
; MIPS64-EL-NEXT: ld $1, %got_disp(arr)($gp)
; MIPS64-EL-NEXT: lbu $2, 4($1)
-; MIPS64-EL-NEXT: dsll $2, $2, 32
; MIPS64-EL-NEXT: lbu $3, 5($1)
-; MIPS64-EL-NEXT: dsll $3, $3, 40
+; MIPS64-EL-NEXT: dsll $3, $3, 8
; MIPS64-EL-NEXT: or $2, $3, $2
+; MIPS64-EL-NEXT: dsll $2, $2, 32
; MIPS64-EL-NEXT: lwl $3, 3($1)
; MIPS64-EL-NEXT: lwr $3, 0($1)
-; MIPS64-EL-NEXT: daddiu $4, $zero, 1
-; MIPS64-EL-NEXT: dsll $4, $4, 32
-; MIPS64-EL-NEXT: daddiu $4, $4, -1
-; MIPS64-EL-NEXT: and $3, $3, $4
+; MIPS64-EL-NEXT: dsll $3, $3, 32
+; MIPS64-EL-NEXT: dsrl $3, $3, 32
; MIPS64-EL-NEXT: or $2, $3, $2
; MIPS64-EL-NEXT: lbu $1, 6($1)
; MIPS64-EL-NEXT: dsll $1, $1, 48
@@ -1063,15 +1059,15 @@ define void @pass_array_byval() nounwind {
; MIPS64-EB-NEXT: daddu $1, $1, $25
; MIPS64-EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
; MIPS64-EB-NEXT: ld $1, %got_disp(arr)($gp)
-; MIPS64-EB-NEXT: lbu $2, 5($1)
-; MIPS64-EB-NEXT: dsll $2, $2, 16
-; MIPS64-EB-NEXT: lbu $3, 4($1)
-; MIPS64-EB-NEXT: dsll $3, $3, 24
-; MIPS64-EB-NEXT: or $2, $3, $2
-; MIPS64-EB-NEXT: lwl $3, 0($1)
-; MIPS64-EB-NEXT: lwr $3, 3($1)
-; MIPS64-EB-NEXT: dsll $3, $3, 32
-; MIPS64-EB-NEXT: or $2, $3, $2
+; MIPS64-EB-NEXT: lwl $2, 0($1)
+; MIPS64-EB-NEXT: lwr $2, 3($1)
+; MIPS64-EB-NEXT: dsll $2, $2, 32
+; MIPS64-EB-NEXT: lbu $3, 5($1)
+; MIPS64-EB-NEXT: lbu $4, 4($1)
+; MIPS64-EB-NEXT: dsll $4, $4, 8
+; MIPS64-EB-NEXT: or $3, $4, $3
+; MIPS64-EB-NEXT: dsll $3, $3, 16
+; MIPS64-EB-NEXT: or $2, $2, $3
; MIPS64-EB-NEXT: lbu $1, 6($1)
; MIPS64-EB-NEXT: dsll $1, $1, 8
; MIPS64-EB-NEXT: ld $25, %call16(extern_func)($gp)
@@ -1094,13 +1090,14 @@ define void @pass_array_byval() nounwind {
; MIPS64R2-EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
; MIPS64R2-EL-NEXT: ld $1, %got_disp(arr)($gp)
; MIPS64R2-EL-NEXT: lbu $2, 4($1)
-; MIPS64R2-EL-NEXT: dsll $2, $2, 32
; MIPS64R2-EL-NEXT: lbu $3, 5($1)
-; MIPS64R2-EL-NEXT: dsll $3, $3, 40
+; MIPS64R2-EL-NEXT: dsll $3, $3, 8
; MIPS64R2-EL-NEXT: or $2, $3, $2
+; MIPS64R2-EL-NEXT: dsll $2, $2, 32
; MIPS64R2-EL-NEXT: lwl $3, 3($1)
; MIPS64R2-EL-NEXT: lwr $3, 0($1)
-; MIPS64R2-EL-NEXT: dext $3, $3, 0, 32
+; MIPS64R2-EL-NEXT: dsll $3, $3, 32
+; MIPS64R2-EL-NEXT: dsrl $3, $3, 32
; MIPS64R2-EL-NEXT: or $2, $3, $2
; MIPS64R2-EL-NEXT: lbu $1, 6($1)
; MIPS64R2-EL-NEXT: dsll $1, $1, 48
@@ -1123,16 +1120,15 @@ define void @pass_array_byval() nounwind {
; MIPS64R2-EB-NEXT: daddu $1, $1, $25
; MIPS64R2-EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
; MIPS64R2-EB-NEXT: ld $1, %got_disp(arr)($gp)
-; MIPS64R2-EB-NEXT: lbu $2, 5($1)
-; MIPS64R2-EB-NEXT: dsll $2, $2, 16
-; MIPS64R2-EB-NEXT: lbu $3, 4($1)
-; MIPS64R2-EB-NEXT: dsll $3, $3, 24
-; MIPS64R2-EB-NEXT: or $2, $3, $2
-; MIPS64R2-EB-NEXT: lwl $3, 0($1)
-; MIPS64R2-EB-NEXT: lwr $3, 3($1)
-; MIPS64R2-EB-NEXT: dext $3, $3, 0, 32
-; MIPS64R2-EB-NEXT: dsll $3, $3, 32
-; MIPS64R2-EB-NEXT: or $2, $3, $2
+; MIPS64R2-EB-NEXT: lwl $2, 0($1)
+; MIPS64R2-EB-NEXT: lwr $2, 3($1)
+; MIPS64R2-EB-NEXT: dsll $2, $2, 32
+; MIPS64R2-EB-NEXT: lbu $3, 5($1)
+; MIPS64R2-EB-NEXT: lbu $4, 4($1)
+; MIPS64R2-EB-NEXT: dsll $4, $4, 8
+; MIPS64R2-EB-NEXT: or $3, $4, $3
+; MIPS64R2-EB-NEXT: dsll $3, $3, 16
+; MIPS64R2-EB-NEXT: or $2, $2, $3
; MIPS64R2-EB-NEXT: lbu $1, 6($1)
; MIPS64R2-EB-NEXT: dsll $1, $1, 8
; MIPS64R2-EB-NEXT: ld $25, %call16(extern_func)($gp)
diff --git a/llvm/test/CodeGen/Mips/mips64-f128.ll b/llvm/test/CodeGen/Mips/mips64-f128.ll
index ac29154579c500..5219efe6a07243 100644
--- a/llvm/test/CodeGen/Mips/mips64-f128.ll
+++ b/llvm/test/CodeGen/Mips/mips64-f128.ll
@@ -1986,13 +1986,16 @@ define fp128 @libcall2_copysignl() {
; CMP_CC_FMT-NEXT: lui $1, %hi(%neg(%gp_rel(libcall2_copysignl)))
; CMP_CC_FMT-NEXT: daddu $1, $1, $25
; CMP_CC_FMT-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(libcall2_copysignl)))
-; CMP_CC_FMT-NEXT: ld $2, %got_disp(gld0)($1)
-; CMP_CC_FMT-NEXT: ld $4, 8($2)
-; CMP_CC_FMT-NEXT: ld $1, %got_disp(gld1)($1)
-; CMP_CC_FMT-NEXT: ld $1, 8($1)
-; CMP_CC_FMT-NEXT: dsrl $1, $1, 63
-; CMP_CC_FMT-NEXT: dinsu $4, $1, 63, 1
-; CMP_CC_FMT-NEXT: ld $2, 0($2)
+; CMP_CC_FMT-NEXT: daddiu $2, $zero, 1
+; CMP_CC_FMT-NEXT: dsll $2, $2, 63
+; CMP_CC_FMT-NEXT: ld $3, %got_disp(gld1)($1)
+; CMP_CC_FMT-NEXT: ld $3, 8($3)
+; CMP_CC_FMT-NEXT: and $2, $3, $2
+; CMP_CC_FMT-NEXT: dsrl $2, $2, 63
+; CMP_CC_FMT-NEXT: ld $1, %got_disp(gld0)($1)
+; CMP_CC_FMT-NEXT: ld $4, 8($1)
+; CMP_CC_FMT-NEXT: dinsu $4, $2, 63, 1
+; CMP_CC_FMT-NEXT: ld $2, 0($1)
; CMP_CC_FMT-NEXT: jrc $ra
entry:
%0 = load fp128, ptr @gld0, align 16
diff --git a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
index 1a9fa27c263deb..5f3d609f1f7d5c 100644
--- a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
@@ -47,11 +47,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; MIPSEL-NEXT: sra $1, $1, 28
; MIPSEL-NEXT: sll $2, $1, 1
; MIPSEL-NEXT: addu $1, $2, $1
-; MIPSEL-NEXT: srl $2, $1, 4
-; MIPSEL-NEXT: srl $1, $1, 7
-; MIPSEL-NEXT: andi $1, $1, 1
+; MIPSEL-NEXT: srl $1, $1, 4
+; MIPSEL-NEXT: andi $2, $1, 8
; MIPSEL-NEXT: addiu $3, $zero, 1
-; MIPSEL-NEXT: addu $1, $2, $1
+; MIPSEL-NEXT: srl $2, $2, 3
+; MIPSEL-NEXT: addu $1, $1, $2
; MIPSEL-NEXT: sll $2, $1, 1
; MIPSEL-NEXT: sll $1, $1, 2
; MIPSEL-NEXT: addu $1, $1, $2
@@ -69,10 +69,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; MIPS64EL-NEXT: sll $3, $2, 1
; MIPS64EL-NEXT: addu $2, $3, $2
; MIPS64EL-NEXT: addiu $3, $zero, 1
-; MIPS64EL-NEXT: srl $4, $2, 4
-; MIPS64EL-NEXT: srl $2, $2, 7
-; MIPS64EL-NEXT: andi $2, $2, 1
-; MIPS64EL-NEXT: addu $2, $4, $2
+; MIPS64EL-NEXT: srl $2, $2, 4
+; MIPS64EL-NEXT: andi $4, $2, 8
+; MIPS64EL-NEXT: srl $4, $4, 3
+; MIPS64EL-NEXT: addu $2, $2, $4
; MIPS64EL-NEXT: sll $4, $2, 1
; MIPS64EL-NEXT: sll $2, $2, 2
; MIPS64EL-NEXT: addu $2, $2, $4
@@ -91,10 +91,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; MIPSEL: # %bb.0:
; MIPSEL-NEXT: sll $1, $4, 26
; MIPSEL-NEXT: sra $1, $1, 26
-; MIPSEL-NEXT: srl $1, $1, 9
-; MIPSEL-NEXT: andi $1, $1, 3
+; MIPSEL-NEXT: srl $1, $1, 5
+; MIPSEL-NEXT: andi $1, $1, 48
+; MIPSEL-NEXT: srl $1, $1, 4
; MIPSEL-NEXT: addu $1, $4, $1
-; MIPSEL-NEXT: andi $1, $1, 60
+; MIPSEL-NEXT: srl $1, $1, 2
+; MIPSEL-NEXT: sll $1, $1, 2
; MIPSEL-NEXT: subu $1, $4, $1
; MIPSEL-NEXT: andi $1, $1, 63
; MIPSEL-NEXT: jr $ra
@@ -105,10 +107,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; MIPS64EL-NEXT: sll $1, $4, 0
; MIPS64EL-NEXT: sll $2, $1, 26
; MIPS64EL-NEXT: sra $2, $2, 26
-; MIPS64EL-NEXT: srl $2, $2, 9
-; MIPS64EL-NEXT: andi $2, $2, 3
+; MIPS64EL-NEXT: srl $2, $2, 5
+; MIPS64EL-NEXT: andi $2, $2, 48
+; MIPS64EL-NEXT: srl $2, $2, 4
; MIPS64EL-NEXT: addu $2, $1, $2
-; MIPS64EL-NEXT: andi $2, $2, 60
+; MIPS64EL-NEXT: srl $2, $2, 2
+; MIPS64EL-NEXT: sll $2, $2, 2
; MIPS64EL-NEXT: subu $1, $1, $2
; MIPS64EL-NEXT: andi $1, $1, 63
; MIPS64EL-NEXT: jr $ra
diff --git a/llvm/test/CodeGen/Mips/unalignedload.ll b/llvm/test/CodeGen/Mips/unalignedload.ll
index da57b92e8f6df8..8ed11035e6a873 100644
--- a/llvm/test/CodeGen/Mips/unalignedload.ll
+++ b/llvm/test/CodeGen/Mips/unalignedload.ll
@@ -43,14 +43,14 @@ define void @bar1() nounwind {
; MIPS32-EB-NEXT: addu $gp, $2, $25
; MIPS32-EB-NEXT: lw $1, %got(s2)($gp)
; MIPS32-EB-NEXT: lbu $2, 3($1)
-; MIPS32-EB-NEXT: sll $2, $2, 16
; MIPS32-EB-NEXT: lbu $1, 2($1)
-; MIPS32-EB-NEXT: sll $1, $1, 24
+; MIPS32-EB-NEXT: sll $1, $1, 8
+; MIPS32-EB-NEXT: or $1, $1, $2
; MIPS32-EB-NEXT: lw $25, %call16(foo2)($gp)
; MIPS32-EB-NEXT: .reloc ($tmp0), R_MIPS_JALR, foo2
; MIPS32-EB-NEXT: $tmp0:
; MIPS32-EB-NEXT: jalr $25
-; MIPS32-EB-NEXT: or $4, $1, $2
+; MIPS32-EB-NEXT: sll $4, $1, 16
; MIPS32-EB-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
; MIPS32-EB-NEXT: jr $ra
; MIPS32-EB-NEXT: addiu $sp, $sp, 24
@@ -130,12 +130,12 @@ define void @bar2() nounwind {
; MIPS32-EB-NEXT: addu $gp, $2, $25
; MIPS32-EB-NEXT: lw $1, %got(s4)($gp)
; MIPS32-EB-NEXT: lwl $4, 0($1)
-; MIPS32-EB-NEXT: lbu $2, 5($1)
; MIPS32-EB-NEXT: lwr $4, 3($1)
-; MIPS32-EB-NEXT: sll $2, $2, 16
+; MIPS32-EB-NEXT: lbu $2, 5($1)
; MIPS32-EB-NEXT: lbu $3, 4($1)
-; MIPS32-EB-NEXT: sll $3, $3, 24
+; MIPS32-EB-NEXT: sll $3, $3, 8
; MIPS32-EB-NEXT: or $2, $3, $2
+; MIPS32-EB-NEXT: sll $2, $2, 16
; MIPS32-EB-NEXT: lbu $1, 6($1)
; MIPS32-EB-NEXT: sll $1, $1, 8
; MIPS32-EB-NEXT: lw $25, %call16(foo4)($gp)
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index 9cc45fbe313b7e..1a06c248904848 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -92,8 +92,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end8
; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4;
-; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3;
-; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
+; CHECKPTX62-NEXT: and.b32 %r38, %r22, 3;
+; CHECKPTX62-NEXT: shl.b32 %r11, %r38, 3;
; CHECKPTX62-NEXT: shl.b32 %r40, %r26, %r11;
; CHECKPTX62-NEXT: not.b32 %r12, %r40;
; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10];
@@ -112,8 +112,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end26
; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4;
-; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3;
-; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24;
+; CHECKPTX62-NEXT: and.b32 %r46, %r23, 3;
+; CHECKPTX62-NEXT: shl.b32 %r17, %r46, 3;
; CHECKPTX62-NEXT: shl.b32 %r48, %r26, %r17;
; CHECKPTX62-NEXT: not.b32 %r18, %r48;
; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16];
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 96a4359d0ec43e..3630e45ef53b4a 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -776,20 +776,23 @@ define void @test_ldst_v4i8(ptr %a, ptr %b) {
define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
; CHECK-LABEL: test_ldst_v4i8_unaligned(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
-; CHECK-NEXT: ld.u8 %r1, [%rd1];
-; CHECK-NEXT: ld.u8 %r2, [%rd1+1];
-; CHECK-NEXT: ld.u8 %r3, [%rd1+2];
-; CHECK-NEXT: ld.u8 %r4, [%rd1+3];
-; CHECK-NEXT: st.u8 [%rd2+3], %r4;
-; CHECK-NEXT: st.u8 [%rd2+2], %r3;
-; CHECK-NEXT: st.u8 [%rd2+1], %r2;
-; CHECK-NEXT: st.u8 [%rd2], %r1;
+; CHECK-NEXT: ld.u8 %r1, [%rd1+2];
+; CHECK-NEXT: ld.u8 %r2, [%rd1+3];
+; CHECK-NEXT: shl.b32 %r3, %r2, 8;
+; CHECK-NEXT: or.b32 %r4, %r3, %r1;
+; CHECK-NEXT: ld.u8 %r5, [%rd1];
+; CHECK-NEXT: ld.u8 %r6, [%rd1+1];
+; CHECK-NEXT: st.u8 [%rd2+1], %r6;
+; CHECK-NEXT: st.u8 [%rd2], %r5;
+; CHECK-NEXT: st.u8 [%rd2+3], %r2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 0, 16;
+; CHECK-NEXT: st.u8 [%rd2+2], %r7;
; CHECK-NEXT: ret;
%t1 = load <4 x i8>, ptr %a, align 1
store <4 x i8> %t1, ptr %b, align 1
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 08aa26bd340396..9f60d3349f13d9 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -113,19 +113,19 @@ define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly
;
; CHECK64-LABEL: dynamic_offset(
; CHECK64: {
-; CHECK64-NEXT: .reg .b32 %r<3>;
-; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-NEXT: .reg .b32 %r<2>;
+; CHECK64-NEXT: .reg .b64 %rd<8>;
; CHECK64-EMPTY:
; CHECK64-NEXT: // %bb.0: // %bb
; CHECK64-NEXT: ld.param.u64 %rd1, [dynamic_offset_param_0];
; CHECK64-NEXT: mov.b64 %rd2, dynamic_offset_param_1;
; CHECK64-NEXT: mov.u64 %rd3, %rd2;
; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; CHECK64-NEXT: ld.param.u32 %r1, [dynamic_offset_param_2];
-; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
-; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
-; CHECK64-NEXT: ld.param.u32 %r2, [%rd6];
-; CHECK64-NEXT: st.global.u32 [%rd4], %r2;
+; CHECK64-NEXT: ld.param.s32 %rd5, [dynamic_offset_param_2];
+; CHECK64-NEXT: shl.b64 %rd6, %rd5, 2;
+; CHECK64-NEXT: add.s64 %rd7, %rd3, %rd6;
+; CHECK64-NEXT: ld.param.u32 %r1, [%rd7];
+; CHECK64-NEXT: st.global.u32 [%rd4], %r1;
; CHECK64-NEXT: ret;
bb:
%tmp = sext i32 %arg2 to i64
@@ -172,18 +172,17 @@ define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly b
; CHECK64-LABEL: gep_bitcast(
; CHECK64: {
; CHECK64-NEXT: .reg .b16 %rs<2>;
-; CHECK64-NEXT: .reg .b32 %r<2>;
-; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-NEXT: .reg .b64 %rd<8>;
; CHECK64-EMPTY:
; CHECK64-NEXT: // %bb.0: // %bb
; CHECK64-NEXT: ld.param.u64 %rd1, [gep_bitcast_param_0];
; CHECK64-NEXT: mov.b64 %rd2, gep_bitcast_param_1;
; CHECK64-NEXT: mov.u64 %rd3, %rd2;
; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; CHECK64-NEXT: ld.param.u32 %r1, [gep_bitcast_param_2];
-; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
-; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
-; CHECK64-NEXT: ld.param.u8 %rs1, [%rd6];
+; CHECK64-NEXT: ld.param.s32 %rd5, [gep_bitcast_param_2];
+; CHECK64-NEXT: shl.b64 %rd6, %rd5, 2;
+; CHECK64-NEXT: add.s64 %rd7, %rd3, %rd6;
+; CHECK64-NEXT: ld.param.u8 %rs1, [%rd7];
; CHECK64-NEXT: st.global.u8 [%rd4], %rs1;
; CHECK64-NEXT: ret;
bb:
@@ -231,18 +230,17 @@ define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readon
; CHECK64-LABEL: gep_bitcast_asc(
; CHECK64: {
; CHECK64-NEXT: .reg .b16 %rs<2>;
-; CHECK64-NEXT: .reg .b32 %r<2>;
-; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-NEXT: .reg .b64 %rd<8>;
; CHECK64-EMPTY:
; CHECK64-NEXT: // %bb.0: // %bb
; CHECK64-NEXT: ld.param.u64 %rd1, [gep_bitcast_asc_param_0];
; CHECK64-NEXT: mov.b64 %rd2, gep_bitcast_asc_param_1;
; CHECK64-NEXT: mov.u64 %rd3, %rd2;
; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; CHECK64-NEXT: ld.param.u32 %r1, [gep_bitcast_asc_param_2];
-; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
-; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
-; CHECK64-NEXT: ld.param.u8 %rs1, [%rd6];
+; CHECK64-NEXT: ld.param.s32 %rd5, [gep_bitcast_asc_param_2];
+; CHECK64-NEXT: shl.b64 %rd6, %rd5, 2;
+; CHECK64-NEXT: add.s64 %rd7, %rd3, %rd6;
+; CHECK64-NEXT: ld.param.u8 %rs1, [%rd7];
; CHECK64-NEXT: st.global.u8 [%rd4], %rs1;
; CHECK64-NEXT: ret;
bb:
@@ -323,38 +321,38 @@ define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham
; CHECK64-NEXT: .local .align 4 .b8 __local_depot4[16];
; CHECK64-NEXT: .reg .b64 %SP;
; CHECK64-NEXT: .reg .b64 %SPL;
-; CHECK64-NEXT: .reg .b32 %r<7>;
-; CHECK64-NEXT: .reg .b64 %rd<10>;
+; CHECK64-NEXT: .reg .b32 %r<6>;
+; CHECK64-NEXT: .reg .b64 %rd<11>;
; CHECK64-EMPTY:
; CHECK64-NEXT: // %bb.0: // %bb
; CHECK64-NEXT: mov.u64 %SPL, __local_depot4;
; CHECK64-NEXT: ld.param.u64 %rd1, [pointer_escapes_param_0];
; CHECK64-NEXT: add.u64 %rd3, %SPL, 0;
-; CHECK64-NEXT: ld.param.u32 %r1, [pointer_escapes_param_2];
-; CHECK64-NEXT: ld.param.u32 %r2, [pointer_escapes_param_1+12];
-; CHECK64-NEXT: ld.param.u32 %r3, [pointer_escapes_param_1+8];
-; CHECK64-NEXT: ld.param.u32 %r4, [pointer_escapes_param_1+4];
-; CHECK64-NEXT: ld.param.u32 %r5, [pointer_escapes_param_1];
-; CHECK64-NEXT: st.local.u32 [%rd3], %r5;
-; CHECK64-NEXT: st.local.u32 [%rd3+4], %r4;
-; CHECK64-NEXT: st.local.u32 [%rd3+8], %r3;
-; CHECK64-NEXT: st.local.u32 [%rd3+12], %r2;
-; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
-; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
-; CHECK64-NEXT: cvta.local.u64 %rd7, %rd6;
-; CHECK64-NEXT: ld.local.u32 %r6, [%rd6];
-; CHECK64-NEXT: st.global.u32 [%rd4], %r6;
+; CHECK64-NEXT: ld.param.s32 %rd4, [pointer_escapes_param_2];
+; CHECK64-NEXT: ld.param.u32 %r1, [pointer_escapes_param_1+12];
+; CHECK64-NEXT: ld.param.u32 %r2, [pointer_escapes_param_1+8];
+; CHECK64-NEXT: ld.param.u32 %r3, [pointer_escapes_param_1+4];
+; CHECK64-NEXT: ld.param.u32 %r4, [pointer_escapes_param_1];
+; CHECK64-NEXT: st.local.u32 [%rd3], %r4;
+; CHECK64-NEXT: st.local.u32 [%rd3+4], %r3;
+; CHECK64-NEXT: st.local.u32 [%rd3+8], %r2;
+; CHECK64-NEXT: st.local.u32 [%rd3+12], %r1;
+; CHECK64-NEXT: cvta.to.global.u64 %rd5, %rd1;
+; CHECK64-NEXT: shl.b64 %rd6, %rd4, 2;
+; CHECK64-NEXT: add.s64 %rd7, %rd3, %rd6;
+; CHECK64-NEXT: cvta.local.u64 %rd8, %rd7;
+; CHECK64-NEXT: ld.local.u32 %r5, [%rd7];
+; CHECK64-NEXT: st.global.u32 [%rd5], %r5;
; CHECK64-NEXT: { // callseq 0, 0
; CHECK64-NEXT: .param .b64 param0;
-; CHECK64-NEXT: st.param.b64 [param0+0], %rd7;
+; CHECK64-NEXT: st.param.b64 [param0+0], %rd8;
; CHECK64-NEXT: .param .b64 retval0;
; CHECK64-NEXT: call.uni (retval0),
; CHECK64-NEXT: escape,
; CHECK64-NEXT: (
; CHECK64-NEXT: param0
; CHECK64-NEXT: );
-; CHECK64-NEXT: ld.param.b64 %rd8, [retval0+0];
+; CHECK64-NEXT: ld.param.b64 %rd9, [retval0+0];
; CHECK64-NEXT: } // callseq 0
; CHECK64-NEXT: ret;
bb:
diff --git a/llvm/test/CodeGen/PowerPC/coalesce-ext.ll b/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
index bd726d330dbb7b..1eceb017ef7e4f 100644
--- a/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -6,7 +6,8 @@ define i32 @test1sext(i64 %A, i64 %B, ptr %P, ptr %P2) nounwind {
; CHECK-LABEL: test1sext:
; CHECK: # %bb.0:
; CHECK-NEXT: add r4, r3, r4
-; CHECK-NEXT: extsw r3, r4
+; CHECK-NEXT: sldi r3, r4, 32
+; CHECK-NEXT: sradi r3, r3, 32
; CHECK-NEXT: std r3, 0(r6)
; CHECK-NEXT: add r3, r4, r4
; CHECK-NEXT: stw r4, 0(r5)
diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
index 9f62477ae01df2..626ed70d8b5b85 100644
--- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
@@ -10,8 +10,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.vy02, ptr %.vy03, ptr %.vy04, ptr %.vy05, ptr %.vy06, ptr %.vy07, ptr %.vy08, ptr %.vy09, ptr %.vy0a, ptr %.vy0b, ptr %.vy0c, ptr %.vy21, ptr %.vy22, ptr %.vy23, ptr %.vy24, ptr %.vy25, ptr %.vy26, ptr %.vy27, ptr %.vy28, ptr %.vy29, ptr %.vy2a, ptr %.vy2b, ptr %.vy2c) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lwz 4, 0(4)
-; CHECK-NEXT: cmpwi 4, 1
+; CHECK-NEXT: lwz 0, 0(4)
+; CHECK-NEXT: cmpwi 0, 1
; CHECK-NEXT: bltlr 0
; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph
; CHECK-NEXT: lwz 3, 0(3)
@@ -56,138 +56,137 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: .cfi_offset v29, -240
; CHECK-NEXT: .cfi_offset v30, -224
; CHECK-NEXT: .cfi_offset v31, -208
+; CHECK-NEXT: ld 4, 848(1)
+; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: std 22, 464(1) # 8-byte Folded Spill
; CHECK-NEXT: std 23, 472(1) # 8-byte Folded Spill
-; CHECK-NEXT: mr 22, 5
-; CHECK-NEXT: ld 5, 848(1)
-; CHECK-NEXT: addi 3, 3, 1
-; CHECK-NEXT: mr 11, 7
-; CHECK-NEXT: ld 23, 688(1)
-; CHECK-NEXT: ld 7, 728(1)
-; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill
-; CHECK-NEXT: mr 18, 6
+; CHECK-NEXT: mr 23, 5
+; CHECK-NEXT: lwa 5, 0(7)
+; CHECK-NEXT: ld 11, 712(1)
+; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 29, 824(1)
+; CHECK-NEXT: ld 22, 776(1)
+; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill
+; CHECK-NEXT: mr 24, 6
; CHECK-NEXT: li 6, 9
-; CHECK-NEXT: ld 19, 768(1)
-; CHECK-NEXT: ld 2, 760(1)
-; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 12, 816(1)
; CHECK-NEXT: cmpldi 3, 9
-; CHECK-NEXT: ld 27, 816(1)
-; CHECK-NEXT: ld 26, 808(1)
+; CHECK-NEXT: ld 28, 808(1)
; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill
; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 15, 736(1)
-; CHECK-NEXT: lxv 39, 0(8)
+; CHECK-NEXT: ld 15, 720(1)
+; CHECK-NEXT: ld 25, 784(1)
; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill
; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 30, 704(1)
-; CHECK-NEXT: lxv 38, 0(9)
+; CHECK-NEXT: ld 2, 704(1)
+; CHECK-NEXT: ld 30, 696(1)
; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill
; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 21, 784(1)
-; CHECK-NEXT: ld 20, 776(1)
-; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill
-; CHECK-NEXT: iselgt 3, 3, 6
-; CHECK-NEXT: ld 6, 720(1)
-; CHECK-NEXT: ld 24, 792(1)
-; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill
-; CHECK-NEXT: addi 3, 3, -2
-; CHECK-NEXT: lxv 6, 0(19)
-; CHECK-NEXT: lxv 11, 0(7)
-; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 6, 48(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 840(1)
-; CHECK-NEXT: lxv 12, 0(6)
-; CHECK-NEXT: rldicl 12, 3, 61, 3
-; CHECK-NEXT: std 19, 120(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 20, 128(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 4, 0(21)
-; CHECK-NEXT: ld 25, 800(1)
-; CHECK-NEXT: lxv 33, 0(10)
-; CHECK-NEXT: lxv 32, 0(23)
-; CHECK-NEXT: lxv 36, 0(30)
+; CHECK-NEXT: ld 20, 760(1)
+; CHECK-NEXT: ld 21, 768(1)
; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill
; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 17, 752(1)
-; CHECK-NEXT: ld 16, 744(1)
-; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 29, 712(1)
-; CHECK-NEXT: ld 28, 696(1)
-; CHECK-NEXT: std 8, 56(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 37, 0(28)
-; CHECK-NEXT: lxv 13, 0(29)
-; CHECK-NEXT: mr 8, 29
-; CHECK-NEXT: mr 9, 30
-; CHECK-NEXT: mr 10, 28
-; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 26, 160(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 10, 0(15)
-; CHECK-NEXT: lxv 9, 0(16)
-; CHECK-NEXT: li 28, 1
+; CHECK-NEXT: iselgt 3, 3, 6
+; CHECK-NEXT: ld 17, 736(1)
+; CHECK-NEXT: ld 16, 728(1)
+; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 19, 752(1)
+; CHECK-NEXT: ld 18, 744(1)
+; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 27, 800(1)
+; CHECK-NEXT: ld 26, 792(1)
+; CHECK-NEXT: std 9, 48(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 10, 56(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 33, 0(10)
+; CHECK-NEXT: lxv 39, 0(8)
; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill
; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 8, 0(17)
-; CHECK-NEXT: lxv 7, 0(2)
+; CHECK-NEXT: sldi 7, 5, 3
+; CHECK-NEXT: addi 3, 3, -2
+; CHECK-NEXT: sldi 14, 5, 4
+; CHECK-NEXT: lxv 13, 0(11)
+; CHECK-NEXT: lxv 7, 0(20)
+; CHECK-NEXT: std 4, 200(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 11, 64(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 15, 72(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 4, 840(1)
+; CHECK-NEXT: mulli 11, 5, 48
+; CHECK-NEXT: lxv 5, 0(22)
+; CHECK-NEXT: lxv 0, 0(12)
+; CHECK-NEXT: std 20, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 21, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 38, 0(9)
+; CHECK-NEXT: rldicl 6, 3, 61, 3
+; CHECK-NEXT: lxv 37, 0(30)
+; CHECK-NEXT: std 22, 128(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 25, 136(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 36, 0(2)
+; CHECK-NEXT: lxv 12, 0(15)
+; CHECK-NEXT: std 28, 160(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 12, 168(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 11, 0(16)
+; CHECK-NEXT: lxv 10, 0(17)
+; CHECK-NEXT: mr 9, 30
+; CHECK-NEXT: std 26, 144(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 27, 152(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 9, 0(18)
+; CHECK-NEXT: lxv 8, 0(19)
; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill
; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 5, 0(20)
-; CHECK-NEXT: lxv 3, 0(24)
+; CHECK-NEXT: lxv 6, 0(21)
+; CHECK-NEXT: lxv 4, 0(25)
; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill
; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 2, 0(25)
-; CHECK-NEXT: lxv 1, 0(26)
+; CHECK-NEXT: lxv 3, 0(26)
+; CHECK-NEXT: lxv 2, 0(27)
; CHECK-NEXT: stxv 52, 208(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 53, 224(1) # 16-byte Folded Spill
-; CHECK-NEXT: lxv 0, 0(27)
+; CHECK-NEXT: lxv 1, 0(28)
+; CHECK-NEXT: lxv 40, 0(29)
+; CHECK-NEXT: li 28, 0
; CHECK-NEXT: stxv 54, 240(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 55, 256(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 56, 272(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill
+; CHECK-NEXT: std 4, 192(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 4, 832(1)
; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill
-; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 832(1)
; CHECK-NEXT: stxv 59, 320(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 60, 336(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 61, 352(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 62, 368(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 63, 384(1) # 16-byte Folded Spill
-; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 16, 96(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 5, 184(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 824(1)
-; CHECK-NEXT: std 5, 176(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill
-; CHECK-NEXT: lwa 5, 0(11)
-; CHECK-NEXT: li 27, 0
-; CHECK-NEXT: ld 7, 176(1) # 8-byte Folded Reload
+; CHECK-NEXT: std 16, 80(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 17, 88(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 18, 96(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 19, 104(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 4, 184(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 29, 176(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 4, 688(1)
+; CHECK-NEXT: li 29, 1
+; CHECK-NEXT: ld 12, 184(1) # 8-byte Folded Reload
+; CHECK-NEXT: lxv 41, 0(12)
+; CHECK-NEXT: std 4, 32(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 8, 40(1) # 8-byte Folded Spill
+; CHECK-NEXT: add 4, 7, 23
+; CHECK-NEXT: ld 10, 32(1) # 8-byte Folded Reload
+; CHECK-NEXT: mr 8, 2
+; CHECK-NEXT: addi 31, 4, 32
+; CHECK-NEXT: add 4, 14, 23
+; CHECK-NEXT: mr 27, 23
+; CHECK-NEXT: addi 3, 4, 32
+; CHECK-NEXT: addi 4, 6, 1
; CHECK-NEXT: mulli 6, 5, 40
-; CHECK-NEXT: sldi 0, 5, 4
-; CHECK-NEXT: extswsli 14, 5, 3
-; CHECK-NEXT: lxv 40, 0(7)
-; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload
-; CHECK-NEXT: add 31, 14, 22
-; CHECK-NEXT: add 11, 0, 22
-; CHECK-NEXT: mr 26, 22
-; CHECK-NEXT: addi 3, 11, 32
-; CHECK-NEXT: addi 11, 12, 1
-; CHECK-NEXT: mulli 12, 5, 48
-; CHECK-NEXT: addi 31, 31, 32
-; CHECK-NEXT: add 19, 22, 6
+; CHECK-NEXT: lxv 32, 0(10)
+; CHECK-NEXT: add 20, 23, 6
; CHECK-NEXT: sldi 6, 5, 5
; CHECK-NEXT: mulli 5, 5, 24
-; CHECK-NEXT: lxv 41, 0(7)
-; CHECK-NEXT: add 20, 22, 6
-; CHECK-NEXT: add 21, 22, 5
+; CHECK-NEXT: add 21, 23, 6
+; CHECK-NEXT: add 22, 23, 5
; CHECK-NEXT: ld 5, 192(1) # 8-byte Folded Reload
; CHECK-NEXT: lxv 43, 0(5)
; CHECK-NEXT: ld 5, 200(1) # 8-byte Folded Reload
@@ -196,16 +195,16 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_4 Depth 2
-; CHECK-NEXT: maddld 5, 12, 27, 0
-; CHECK-NEXT: mr 6, 18
-; CHECK-NEXT: mr 29, 21
-; CHECK-NEXT: mr 30, 20
-; CHECK-NEXT: mr 2, 19
-; CHECK-NEXT: mtctr 11
-; CHECK-NEXT: add 25, 22, 5
-; CHECK-NEXT: maddld 5, 12, 27, 14
-; CHECK-NEXT: add 24, 22, 5
-; CHECK-NEXT: mr 5, 26
+; CHECK-NEXT: maddld 5, 11, 28, 14
+; CHECK-NEXT: mr 6, 24
+; CHECK-NEXT: mr 30, 22
+; CHECK-NEXT: mr 2, 21
+; CHECK-NEXT: mr 12, 20
+; CHECK-NEXT: mtctr 4
+; CHECK-NEXT: add 26, 23, 5
+; CHECK-NEXT: maddld 5, 11, 28, 7
+; CHECK-NEXT: add 25, 23, 5
+; CHECK-NEXT: mr 5, 27
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB0_4: # %_loop_2_do_
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
@@ -213,19 +212,19 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: lxvp 34, 0(6)
; CHECK-NEXT: lxvp 44, 0(5)
; CHECK-NEXT: xvmaddadp 39, 45, 35
-; CHECK-NEXT: lxvp 46, 0(24)
+; CHECK-NEXT: lxvp 46, 0(25)
; CHECK-NEXT: xvmaddadp 38, 47, 35
-; CHECK-NEXT: lxvp 48, 0(25)
-; CHECK-NEXT: lxvp 50, 0(29)
-; CHECK-NEXT: lxvp 62, 0(30)
-; CHECK-NEXT: lxvp 60, 0(2)
+; CHECK-NEXT: lxvp 48, 0(26)
+; CHECK-NEXT: lxvp 50, 0(30)
+; CHECK-NEXT: lxvp 62, 0(2)
+; CHECK-NEXT: lxvp 60, 0(12)
; CHECK-NEXT: lxvp 58, 32(6)
; CHECK-NEXT: lxvp 56, 32(5)
-; CHECK-NEXT: lxvp 54, 32(24)
-; CHECK-NEXT: lxvp 52, 32(25)
-; CHECK-NEXT: lxvp 30, 32(29)
-; CHECK-NEXT: lxvp 28, 32(30)
-; CHECK-NEXT: lxvp 26, 32(2)
+; CHECK-NEXT: lxvp 54, 32(25)
+; CHECK-NEXT: lxvp 52, 32(26)
+; CHECK-NEXT: lxvp 30, 32(30)
+; CHECK-NEXT: lxvp 28, 32(2)
+; CHECK-NEXT: lxvp 26, 32(12)
; CHECK-NEXT: xvmaddadp 33, 49, 35
; CHECK-NEXT: xvmaddadp 32, 51, 35
; CHECK-NEXT: xvmaddadp 37, 63, 35
@@ -250,29 +249,29 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: xvmaddadp 42, 26, 58
; CHECK-NEXT: addi 6, 6, 64
; CHECK-NEXT: addi 5, 5, 64
-; CHECK-NEXT: addi 24, 24, 64
; CHECK-NEXT: addi 25, 25, 64
-; CHECK-NEXT: addi 29, 29, 64
+; CHECK-NEXT: addi 26, 26, 64
; CHECK-NEXT: addi 30, 30, 64
; CHECK-NEXT: addi 2, 2, 64
+; CHECK-NEXT: addi 12, 12, 64
; CHECK-NEXT: bdnz .LBB0_4
; CHECK-NEXT: # %bb.5: # %_loop_2_endl_
; CHECK-NEXT: #
-; CHECK-NEXT: addi 28, 28, 6
-; CHECK-NEXT: add 26, 26, 12
-; CHECK-NEXT: add 31, 31, 12
-; CHECK-NEXT: add 19, 19, 12
-; CHECK-NEXT: add 3, 3, 12
-; CHECK-NEXT: add 20, 20, 12
-; CHECK-NEXT: add 21, 21, 12
-; CHECK-NEXT: addi 27, 27, 1
-; CHECK-NEXT: cmpld 28, 4
+; CHECK-NEXT: addi 29, 29, 6
+; CHECK-NEXT: add 27, 27, 11
+; CHECK-NEXT: add 31, 31, 11
+; CHECK-NEXT: add 20, 20, 11
+; CHECK-NEXT: add 3, 3, 11
+; CHECK-NEXT: add 21, 21, 11
+; CHECK-NEXT: add 22, 22, 11
+; CHECK-NEXT: addi 28, 28, 1
+; CHECK-NEXT: cmpld 29, 0
; CHECK-NEXT: ble 0, .LBB0_3
; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit
-; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload
; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload
; CHECK-NEXT: stxv 39, 0(3)
-; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload
; CHECK-NEXT: lxv 62, 368(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 61, 352(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 60, 336(1) # 16-byte Folded Reload
@@ -285,7 +284,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: lxv 53, 224(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 52, 208(1) # 16-byte Folded Reload
; CHECK-NEXT: stxv 38, 0(3)
-; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload
; CHECK-NEXT: lfd 31, 584(1) # 8-byte Folded Reload
; CHECK-NEXT: lfd 30, 576(1) # 8-byte Folded Reload
; CHECK-NEXT: lfd 29, 568(1) # 8-byte Folded Reload
@@ -298,7 +297,10 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: ld 28, 512(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 27, 504(1) # 8-byte Folded Reload
; CHECK-NEXT: stxv 33, 0(3)
-; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload
+; CHECK-NEXT: stxv 32, 0(10)
+; CHECK-NEXT: stxv 37, 0(9)
+; CHECK-NEXT: stxv 36, 0(8)
; CHECK-NEXT: ld 26, 496(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 25, 488(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 24, 480(1) # 8-byte Folded Reload
@@ -310,11 +312,8 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: ld 18, 432(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 17, 424(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 16, 416(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 32, 0(3)
-; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 37, 0(10)
-; CHECK-NEXT: stxv 36, 0(9)
-; CHECK-NEXT: stxv 13, 0(8)
+; CHECK-NEXT: stxv 13, 0(3)
+; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 15, 408(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 14, 400(1) # 8-byte Folded Reload
; CHECK-NEXT: stxv 12, 0(3)
diff --git a/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll b/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll
index 799ba63a4df274..07a85cd19973e5 100644
--- a/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll
+++ b/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll
@@ -15,37 +15,40 @@ define signext i32 @test(ptr noalias %PtrA, ptr noalias %PtrB, i32 signext %LenA
; CHECK-NEXT: addi 8, 8, 1
; CHECK-NEXT: extsw 7, 7
; CHECK-NEXT: cmpw 8, 5
-; CHECK-NEXT: sldi 10, 7, 2
-; CHECK-NEXT: sldi 9, 9, 2
+; CHECK-NEXT: sldi 11, 7, 2
+; CHECK-NEXT: sldi 10, 9, 2
; CHECK-NEXT: addi 7, 7, 1
-; CHECK-NEXT: add 10, 4, 10
+; CHECK-NEXT: add 11, 4, 11
; CHECK-NEXT: crnot 20, 0
; CHECK-NEXT: bc 12, 20, .LBB0_5
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB0_2: # %if.end
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: lwz 12, 4(10)
-; CHECK-NEXT: addi 11, 10, 4
-; CHECK-NEXT: cmplwi 12, 0
+; CHECK-NEXT: lwz 0, 4(11)
+; CHECK-NEXT: addi 12, 11, 4
+; CHECK-NEXT: cmplwi 0, 0
; CHECK-NEXT: beq 0, .LBB0_4
; CHECK-NEXT: # %bb.3: # %if.then4
; CHECK-NEXT: #
-; CHECK-NEXT: lwzx 12, 6, 9
+; CHECK-NEXT: lwzx 0, 6, 10
; CHECK-NEXT: addi 7, 7, 1
-; CHECK-NEXT: stw 12, 8(10)
-; CHECK-NEXT: mr 10, 11
+; CHECK-NEXT: stw 0, 8(11)
+; CHECK-NEXT: mr 11, 12
; CHECK-NEXT: bc 4, 20, .LBB0_2
; CHECK-NEXT: b .LBB0_5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: # %if.end9
; CHECK-NEXT: #
-; CHECK-NEXT: lwzx 10, 6, 9
-; CHECK-NEXT: addi 10, 10, 1
-; CHECK-NEXT: stwx 10, 6, 9
+; CHECK-NEXT: lwzx 9, 6, 10
+; CHECK-NEXT: addi 9, 9, 1
+; CHECK-NEXT: stwx 9, 6, 10
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_5: # %if.then
-; CHECK-NEXT: lwax 3, 9, 3
+; CHECK-NEXT: addi 4, 9, -1
+; CHECK-NEXT: sldi 4, 4, 2
+; CHECK-NEXT: add 3, 3, 4
+; CHECK-NEXT: lwa 3, 4(3)
; CHECK-NEXT: blr
entry:
br label %block2
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 7a6640fea2d1e4..7e11274a31c19f 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -180,19 +180,25 @@ entry:
define <8 x i16> @sub_absv_16_ext(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
; CHECK-PWR9-LABEL: sub_absv_16_ext:
; CHECK-PWR9: # %bb.0: # %entry
-; CHECK-PWR9-NEXT: vmrghh v4, v2, v2
+; CHECK-PWR9-NEXT: vspltisw v4, 8
+; CHECK-PWR9-NEXT: vmrghh v5, v2, v2
; CHECK-PWR9-NEXT: vmrglh v2, v2, v2
-; CHECK-PWR9-NEXT: vmrghh v5, v3, v3
+; CHECK-PWR9-NEXT: vmrghh v0, v3, v3
; CHECK-PWR9-NEXT: vmrglh v3, v3, v3
-; CHECK-PWR9-NEXT: vextsh2w v2, v2
-; CHECK-PWR9-NEXT: vextsh2w v3, v3
-; CHECK-PWR9-NEXT: vextsh2w v4, v4
-; CHECK-PWR9-NEXT: vextsh2w v5, v5
+; CHECK-PWR9-NEXT: vadduwm v4, v4, v4
+; CHECK-PWR9-NEXT: vslw v2, v2, v4
+; CHECK-PWR9-NEXT: vslw v3, v3, v4
+; CHECK-PWR9-NEXT: vslw v5, v5, v4
+; CHECK-PWR9-NEXT: vslw v0, v0, v4
+; CHECK-PWR9-NEXT: vsraw v2, v2, v4
+; CHECK-PWR9-NEXT: vsraw v3, v3, v4
+; CHECK-PWR9-NEXT: vsraw v5, v5, v4
+; CHECK-PWR9-NEXT: vsraw v0, v0, v4
; CHECK-PWR9-NEXT: xvnegsp v3, v3
; CHECK-PWR9-NEXT: xvnegsp v2, v2
-; CHECK-PWR9-NEXT: xvnegsp v4, v4
+; CHECK-PWR9-NEXT: xvnegsp v4, v5
; CHECK-PWR9-NEXT: vabsduw v2, v2, v3
-; CHECK-PWR9-NEXT: xvnegsp v3, v5
+; CHECK-PWR9-NEXT: xvnegsp v3, v0
; CHECK-PWR9-NEXT: vabsduw v3, v4, v3
; CHECK-PWR9-NEXT: vpkuwum v2, v3, v2
; CHECK-PWR9-NEXT: blr
@@ -1361,10 +1367,14 @@ define <16 x i8> @zext_sub_absd8(<16 x i4>, <16 x i4>) local_unnamed_addr {
define <4 x i32> @sext_sub_absd32(<4 x i16>, <4 x i16>) local_unnamed_addr {
; CHECK-PWR9-LE-LABEL: sext_sub_absd32:
; CHECK-PWR9-LE: # %bb.0:
+; CHECK-PWR9-LE-NEXT: vspltisw v4, 8
; CHECK-PWR9-LE-NEXT: vmrglh v2, v2, v2
; CHECK-PWR9-LE-NEXT: vmrglh v3, v3, v3
-; CHECK-PWR9-LE-NEXT: vextsh2w v2, v2
-; CHECK-PWR9-LE-NEXT: vextsh2w v3, v3
+; CHECK-PWR9-LE-NEXT: vadduwm v4, v4, v4
+; CHECK-PWR9-LE-NEXT: vslw v2, v2, v4
+; CHECK-PWR9-LE-NEXT: vslw v3, v3, v4
+; CHECK-PWR9-LE-NEXT: vsraw v2, v2, v4
+; CHECK-PWR9-LE-NEXT: vsraw v3, v3, v4
; CHECK-PWR9-LE-NEXT: xvnegsp v3, v3
; CHECK-PWR9-LE-NEXT: xvnegsp v2, v2
; CHECK-PWR9-LE-NEXT: vabsduw v2, v2, v3
@@ -1372,10 +1382,14 @@ define <4 x i32> @sext_sub_absd32(<4 x i16>, <4 x i16>) local_unnamed_addr {
;
; CHECK-PWR9-BE-LABEL: sext_sub_absd32:
; CHECK-PWR9-BE: # %bb.0:
+; CHECK-PWR9-BE-NEXT: vspltisw v4, 8
; CHECK-PWR9-BE-NEXT: vmrghh v2, v2, v2
; CHECK-PWR9-BE-NEXT: vmrghh v3, v3, v3
-; CHECK-PWR9-BE-NEXT: vextsh2w v2, v2
-; CHECK-PWR9-BE-NEXT: vextsh2w v3, v3
+; CHECK-PWR9-BE-NEXT: vadduwm v4, v4, v4
+; CHECK-PWR9-BE-NEXT: vslw v2, v2, v4
+; CHECK-PWR9-BE-NEXT: vslw v3, v3, v4
+; CHECK-PWR9-BE-NEXT: vsraw v2, v2, v4
+; CHECK-PWR9-BE-NEXT: vsraw v3, v3, v4
; CHECK-PWR9-BE-NEXT: xvnegsp v3, v3
; CHECK-PWR9-BE-NEXT: xvnegsp v2, v2
; CHECK-PWR9-BE-NEXT: vabsduw v2, v2, v3
diff --git a/llvm/test/CodeGen/PowerPC/pr38087.ll b/llvm/test/CodeGen/PowerPC/pr38087.ll
index 1216fa9cf8f260..418fd72f108ea7 100644
--- a/llvm/test/CodeGen/PowerPC/pr38087.ll
+++ b/llvm/test/CodeGen/PowerPC/pr38087.ll
@@ -12,8 +12,11 @@ define void @draw_llvm_vs_variant0(<4 x float> %x) {
; CHECK-LABEL: draw_llvm_vs_variant0:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxsd v3, 0(r3)
+; CHECK-NEXT: vspltisw v4, 8
+; CHECK-NEXT: vadduwm v4, v4, v4
; CHECK-NEXT: vmrghh v3, v3, v3
-; CHECK-NEXT: vextsh2w v3, v3
+; CHECK-NEXT: vslw v3, v3, v4
+; CHECK-NEXT: vsraw v3, v3, v4
; CHECK-NEXT: xvcvsxwsp vs0, v3
; CHECK-NEXT: xxspltw vs0, vs0, 2
; CHECK-NEXT: xvmaddasp vs0, v2, v2
diff --git a/llvm/test/CodeGen/PowerPC/pr45432.ll b/llvm/test/CodeGen/PowerPC/pr45432.ll
index 83b5390b6bf148..e79a8f46aafcd9 100644
--- a/llvm/test/CodeGen/PowerPC/pr45432.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45432.ll
@@ -15,8 +15,8 @@ define dso_local void @h() local_unnamed_addr #0 {
; CHECK-NEXT: addis 3, 2, g@toc@ha
; CHECK-NEXT: std 0, 80(1)
; CHECK-NEXT: std 30, 48(1) # 8-byte Folded Spill
-; CHECK-NEXT: lwz 3, g@toc@l(3)
-; CHECK-NEXT: extswsli 30, 3, 2
+; CHECK-NEXT: lwa 3, g@toc@l(3)
+; CHECK-NEXT: sldi 30, 3, 2
; CHECK-NEXT: addis 3, 2, f@got@tlsld@ha
; CHECK-NEXT: addi 3, 3, f@got@tlsld@l
; CHECK-NEXT: bl __tls_get_addr(f@tlsld)
diff --git a/llvm/test/CodeGen/PowerPC/rlwinm.ll b/llvm/test/CodeGen/PowerPC/rlwinm.ll
index 363eb171276566..7ba5c90b29c7a7 100644
--- a/llvm/test/CodeGen/PowerPC/rlwinm.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwinm.ll
@@ -47,7 +47,8 @@ entry:
define i32 @test5(i32 %a) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT: rlwinm 3, 3, 0, 16, 23
+; CHECK-NEXT: srawi 3, 3, 8
; CHECK-NEXT: blr
entry:
%tmp.1 = and i32 %a, 65280
diff --git a/llvm/test/CodeGen/PowerPC/sext-vector-inreg.ll b/llvm/test/CodeGen/PowerPC/sext-vector-inreg.ll
index 0725eb27b57b02..be6c9c9e6d3e8a 100644
--- a/llvm/test/CodeGen/PowerPC/sext-vector-inreg.ll
+++ b/llvm/test/CodeGen/PowerPC/sext-vector-inreg.ll
@@ -4,8 +4,11 @@
define <4 x i32> @test_signext_vector_inreg(<4 x i16> %n) {
; CHECK-P9-LABEL: test_signext_vector_inreg:
; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: vspltisw 3, 8
; CHECK-P9-NEXT: vmrglh 2, 2, 2
-; CHECK-P9-NEXT: vextsh2w 2, 2
+; CHECK-P9-NEXT: vadduwm 3, 3, 3
+; CHECK-P9-NEXT: vslw 2, 2, 3
+; CHECK-P9-NEXT: vsraw 2, 2, 3
; CHECK-P9-NEXT: blr
;
; CHECK-P8-LABEL: test_signext_vector_inreg:
diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
index 628822edabf392..c5ccaaab4f28d3 100644
--- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
@@ -19,39 +19,27 @@ define void @phi3(ptr) nounwind {
; CHECK-NEXT: mr 29, 3
; CHECK-NEXT: bl malloc
; CHECK-NEXT: nop
-; CHECK-NEXT: addi 7, 30, -4
+; CHECK-NEXT: addi 5, 30, -4
+; CHECK-NEXT: li 6, 0
; CHECK-NEXT: mtctr 3
; CHECK-NEXT: addi 4, 29, -8
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: lwzu 8, 4(7)
-; CHECK-NEXT: bdz .LBB0_5
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: extswsli 6, 5, 5
-; CHECK-NEXT: add 5, 8, 5
-; CHECK-NEXT: lwzu 8, 4(7)
-; CHECK-NEXT: bdz .LBB0_4
-; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lwzu 7, 4(5)
+; CHECK-NEXT: extsw 8, 6
+; CHECK-NEXT: sldi 6, 8, 5
; CHECK-NEXT: add 6, 3, 6
-; CHECK-NEXT: stdu 6, 8(4)
-; CHECK-NEXT: extswsli 6, 5, 5
-; CHECK-NEXT: add 5, 8, 5
-; CHECK-NEXT: lwzu 8, 4(7)
-; CHECK-NEXT: bdz .LBB0_4
+; CHECK-NEXT: add 7, 7, 8
+; CHECK-NEXT: bdz .LBB0_2
; CHECK-NEXT: .p2align 5
-; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: add 9, 3, 6
-; CHECK-NEXT: extswsli 6, 5, 5
-; CHECK-NEXT: add 5, 8, 5
-; CHECK-NEXT: lwzu 8, 4(7)
-; CHECK-NEXT: stdu 9, 8(4)
-; CHECK-NEXT: bdnz .LBB0_3
-; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: extsw 7, 7
+; CHECK-NEXT: lwzu 8, 4(5)
+; CHECK-NEXT: stdu 6, 8(4)
+; CHECK-NEXT: sldi 6, 7, 5
+; CHECK-NEXT: add 7, 8, 7
; CHECK-NEXT: add 6, 3, 6
+; CHECK-NEXT: bdnz .LBB0_1
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: stdu 6, 8(4)
-; CHECK-NEXT: .LBB0_5:
-; CHECK-NEXT: extswsli 5, 5, 5
-; CHECK-NEXT: add 3, 3, 5
-; CHECK-NEXT: stdu 3, 8(4)
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: ld 0, 16(1)
; CHECK-NEXT: ld 30, -16(1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
index b0cc89d1828eda..e21e0d2a647608 100644
--- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
@@ -88,7 +88,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; PPC-NEXT: srawi 4, 4, 26
; PPC-NEXT: rlwinm 4, 4, 23, 30, 31
; PPC-NEXT: add 4, 3, 4
-; PPC-NEXT: rlwinm 4, 4, 0, 26, 29
+; PPC-NEXT: rlwinm 4, 4, 0, 0, 29
; PPC-NEXT: sub 3, 3, 4
; PPC-NEXT: clrlwi 3, 3, 26
; PPC-NEXT: cntlzw 3, 3
@@ -102,7 +102,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; PPC64LE-NEXT: srawi 4, 4, 26
; PPC64LE-NEXT: rlwinm 4, 4, 23, 30, 31
; PPC64LE-NEXT: add 4, 3, 4
-; PPC64LE-NEXT: rlwinm 4, 4, 0, 26, 29
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 0, 29
; PPC64LE-NEXT: sub 3, 3, 4
; PPC64LE-NEXT: clrlwi 3, 3, 26
; PPC64LE-NEXT: cntlzw 3, 3
diff --git a/llvm/test/CodeGen/PowerPC/vec-itofp.ll b/llvm/test/CodeGen/PowerPC/vec-itofp.ll
index 37a1e46927b1e1..fd01caf2f93a35 100644
--- a/llvm/test/CodeGen/PowerPC/vec-itofp.ll
+++ b/llvm/test/CodeGen/PowerPC/vec-itofp.ll
@@ -292,27 +292,34 @@ define void @stest8(ptr nocapture %Sink, ptr nocapture readonly %SrcPtr) {
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI3_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI3_2@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI3_2@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI3_3@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI3_3@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI3_3@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI3_4@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI3_4@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
@@ -395,13 +402,18 @@ define void @stest4(ptr nocapture %Sink, ptr nocapture readonly %SrcPtr) {
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI4_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI4_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI4_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI4_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
@@ -457,8 +469,12 @@ define void @stest2(ptr nocapture %Sink, ptr nocapture readonly %SrcPtr) {
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-P9-NEXT: lxv vs0, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI5_1@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI5_1@toc@l
+; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: xxperm v2, v2, vs0
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp vs0, v2
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll
index f52a92596dec82..cb2d168d2f97ab 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll
@@ -297,8 +297,11 @@ define <4 x float> @test4elt_signed(i64 %a.coerce) local_unnamed_addr #1 {
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd v2, r3
+; CHECK-P9-NEXT: vspltisw v3, 8
; CHECK-P9-NEXT: vmrghh v2, v2, v2
-; CHECK-P9-NEXT: vextsh2w v2, v2
+; CHECK-P9-NEXT: vadduwm v3, v3, v3
+; CHECK-P9-NEXT: vslw v2, v2, v3
+; CHECK-P9-NEXT: vsraw v2, v2, v3
; CHECK-P9-NEXT: xvcvsxwsp v2, v2
; CHECK-P9-NEXT: blr
;
@@ -337,11 +340,15 @@ define void @test8elt_signed(ptr noalias nocapture sret(<8 x float>) %agg.result
;
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: vmrglh v3, v2, v2
+; CHECK-P9-NEXT: vspltisw v3, 8
+; CHECK-P9-NEXT: vmrglh v4, v2, v2
; CHECK-P9-NEXT: vmrghh v2, v2, v2
-; CHECK-P9-NEXT: vextsh2w v3, v3
-; CHECK-P9-NEXT: vextsh2w v2, v2
-; CHECK-P9-NEXT: xvcvsxwsp vs0, v3
+; CHECK-P9-NEXT: vadduwm v3, v3, v3
+; CHECK-P9-NEXT: vslw v4, v4, v3
+; CHECK-P9-NEXT: vslw v2, v2, v3
+; CHECK-P9-NEXT: vsraw v4, v4, v3
+; CHECK-P9-NEXT: vsraw v2, v2, v3
+; CHECK-P9-NEXT: xvcvsxwsp vs0, v4
; CHECK-P9-NEXT: xvcvsxwsp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: stxv vs0, 0(r3)
@@ -404,24 +411,30 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x float>) %agg.resu
;
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: lxv v2, 16(r4)
-; CHECK-P9-NEXT: vmrglh v4, v3, v3
-; CHECK-P9-NEXT: vmrghh v3, v3, v3
-; CHECK-P9-NEXT: vextsh2w v3, v3
-; CHECK-P9-NEXT: vextsh2w v4, v4
-; CHECK-P9-NEXT: xvcvsxwsp vs1, v3
-; CHECK-P9-NEXT: vmrglh v3, v2, v2
+; CHECK-P9-NEXT: lxv v2, 0(r4)
+; CHECK-P9-NEXT: vspltisw v3, 8
+; CHECK-P9-NEXT: vadduwm v3, v3, v3
+; CHECK-P9-NEXT: vmrglh v4, v2, v2
; CHECK-P9-NEXT: vmrghh v2, v2, v2
+; CHECK-P9-NEXT: vslw v4, v4, v3
+; CHECK-P9-NEXT: vslw v2, v2, v3
+; CHECK-P9-NEXT: vsraw v4, v4, v3
+; CHECK-P9-NEXT: vsraw v2, v2, v3
; CHECK-P9-NEXT: xvcvsxwsp vs0, v4
-; CHECK-P9-NEXT: vextsh2w v3, v3
-; CHECK-P9-NEXT: vextsh2w v2, v2
-; CHECK-P9-NEXT: xvcvsxwsp vs2, v3
-; CHECK-P9-NEXT: xvcvsxwsp vs3, v2
+; CHECK-P9-NEXT: lxv v4, 16(r4)
+; CHECK-P9-NEXT: xvcvsxwsp vs1, v2
+; CHECK-P9-NEXT: vmrglh v2, v4, v4
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: stxv vs3, 48(r3)
+; CHECK-P9-NEXT: vslw v2, v2, v3
+; CHECK-P9-NEXT: vsraw v2, v2, v3
+; CHECK-P9-NEXT: xvcvsxwsp vs2, v2
+; CHECK-P9-NEXT: vmrghh v2, v4, v4
+; CHECK-P9-NEXT: vslw v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
+; CHECK-P9-NEXT: vsraw v2, v2, v3
+; CHECK-P9-NEXT: xvcvsxwsp vs3, v2
+; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt_signed:
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
index bfb8b72327f5a6..7f61d0adfb3ed9 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
@@ -399,8 +399,12 @@ define <2 x double> @test2elt_signed(i32 %a.coerce) local_unnamed_addr #0 {
; CHECK-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-P9-NEXT: lxv vs0, 0(r3)
+; CHECK-P9-NEXT: addis r3, r2, .LCPI4_1@toc@ha
+; CHECK-P9-NEXT: addi r3, r3, .LCPI4_1@toc@l
+; CHECK-P9-NEXT: lxv v3, 0(r3)
; CHECK-P9-NEXT: xxperm v2, v2, vs0
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp v2, v2
; CHECK-P9-NEXT: blr
;
@@ -458,13 +462,18 @@ define void @test4elt_signed(ptr noalias nocapture sret(<4 x double>) %agg.resul
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI5_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI5_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
@@ -552,27 +561,34 @@ define void @test8elt_signed(ptr noalias nocapture sret(<8 x double>) %agg.resul
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI6_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI6_2@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI6_2@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI6_3@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI6_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI6_3@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI6_3@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI6_4@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI6_4@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
-; CHECK-P9-NEXT: vextsh2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
@@ -700,48 +716,59 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x double>) %agg.res
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_0@toc@ha
-; CHECK-P9-NEXT: lxv v2, 0(r4)
+; CHECK-P9-NEXT: lxv v4, 0(r4)
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l
-; CHECK-P9-NEXT: lxv v3, 0(r5)
+; CHECK-P9-NEXT: lxv v2, 0(r5)
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_1@toc@l
-; CHECK-P9-NEXT: lxv v5, 0(r5)
+; CHECK-P9-NEXT: lxv v3, 0(r5)
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_2@toc@ha
-; CHECK-P9-NEXT: vperm v4, v2, v2, v3
+; CHECK-P9-NEXT: vperm v0, v4, v4, v2
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_2@toc@l
-; CHECK-P9-NEXT: vextsh2d v4, v4
-; CHECK-P9-NEXT: lxv v0, 0(r5)
+; CHECK-P9-NEXT: lxv v5, 0(r5)
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_3@toc@ha
-; CHECK-P9-NEXT: xvcvsxddp vs0, v4
-; CHECK-P9-NEXT: vperm v4, v2, v2, v5
+; CHECK-P9-NEXT: vsld v0, v0, v3
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_3@toc@l
-; CHECK-P9-NEXT: lxv v1, 0(r5)
-; CHECK-P9-NEXT: vextsh2d v4, v4
-; CHECK-P9-NEXT: xvcvsxddp vs1, v4
-; CHECK-P9-NEXT: vperm v4, v2, v2, v0
-; CHECK-P9-NEXT: vperm v2, v2, v2, v1
+; CHECK-P9-NEXT: vperm v1, v4, v4, v5
+; CHECK-P9-NEXT: vsrad v0, v0, v3
+; CHECK-P9-NEXT: xvcvsxddp vs0, v0
+; CHECK-P9-NEXT: vsld v1, v1, v3
+; CHECK-P9-NEXT: vsrad v0, v1, v3
+; CHECK-P9-NEXT: xvcvsxddp vs1, v0
+; CHECK-P9-NEXT: lxv v0, 0(r5)
+; CHECK-P9-NEXT: addis r5, r2, .LCPI7_4@toc@ha
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsh2d v4, v4
-; CHECK-P9-NEXT: xvcvsxddp vs2, v4
-; CHECK-P9-NEXT: lxv v4, 16(r4)
+; CHECK-P9-NEXT: addi r5, r5, .LCPI7_4@toc@l
+; CHECK-P9-NEXT: vperm v1, v4, v4, v0
; CHECK-P9-NEXT: stxv vs1, 16(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
-; CHECK-P9-NEXT: xvcvsxddp vs3, v2
-; CHECK-P9-NEXT: vperm v2, v4, v4, v3
+; CHECK-P9-NEXT: vsld v1, v1, v3
+; CHECK-P9-NEXT: vsrad v1, v1, v3
+; CHECK-P9-NEXT: xvcvsxddp vs2, v1
+; CHECK-P9-NEXT: lxv v1, 0(r5)
+; CHECK-P9-NEXT: vperm v4, v4, v4, v1
; CHECK-P9-NEXT: stxv vs2, 32(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v4, v4, v3
+; CHECK-P9-NEXT: vsrad v4, v4, v3
+; CHECK-P9-NEXT: xvcvsxddp vs3, v4
+; CHECK-P9-NEXT: lxv v4, 16(r4)
+; CHECK-P9-NEXT: vperm v2, v4, v4, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp vs4, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v5
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp vs5, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v0
; CHECK-P9-NEXT: stxv vs4, 64(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp vs6, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v1
; CHECK-P9-NEXT: stxv vs5, 80(r3)
-; CHECK-P9-NEXT: vextsh2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp vs7, v2
; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: stxv vs7, 112(r3)
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
index 1ff1f6b7bc4e83..5f7cbd87937a0f 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
@@ -351,10 +351,13 @@ define <4 x float> @test4elt_signed(i32 %a.coerce) local_unnamed_addr #1 {
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrwz v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha
+; CHECK-P9-NEXT: vspltisw v3, 12
; CHECK-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l
+; CHECK-P9-NEXT: vadduwm v3, v3, v3
; CHECK-P9-NEXT: lxv vs0, 0(r3)
; CHECK-P9-NEXT: xxperm v2, v2, vs0
-; CHECK-P9-NEXT: vextsb2w v2, v2
+; CHECK-P9-NEXT: vslw v2, v2, v3
+; CHECK-P9-NEXT: vsraw v2, v2, v3
; CHECK-P9-NEXT: xvcvsxwsp v2, v2
; CHECK-P9-NEXT: blr
;
@@ -407,17 +410,21 @@ define void @test8elt_signed(ptr noalias nocapture sret(<8 x float>) %agg.result
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-P9-NEXT: vspltisw v4, 12
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_0@toc@l
+; CHECK-P9-NEXT: vadduwm v4, v4, v4
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsb2w v3, v3
+; CHECK-P9-NEXT: vslw v3, v3, v4
+; CHECK-P9-NEXT: vsraw v3, v3, v4
; CHECK-P9-NEXT: xvcvsxwsp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsb2w v2, v2
+; CHECK-P9-NEXT: vslw v2, v2, v4
+; CHECK-P9-NEXT: vsraw v2, v2, v4
; CHECK-P9-NEXT: xvcvsxwsp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
@@ -500,31 +507,37 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x float>) %agg.resu
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_0@toc@ha
+; CHECK-P9-NEXT: vspltisw v4, 12
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_0@toc@l
+; CHECK-P9-NEXT: vadduwm v4, v4, v4
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_1@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsb2w v3, v3
+; CHECK-P9-NEXT: vslw v3, v3, v4
+; CHECK-P9-NEXT: vsraw v3, v3, v4
; CHECK-P9-NEXT: xvcvsxwsp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_2@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsb2w v3, v3
+; CHECK-P9-NEXT: vslw v3, v3, v4
+; CHECK-P9-NEXT: vsraw v3, v3, v4
; CHECK-P9-NEXT: xvcvsxwsp vs1, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
-; CHECK-P9-NEXT: vextsb2w v3, v3
+; CHECK-P9-NEXT: vslw v3, v3, v4
+; CHECK-P9-NEXT: vsraw v3, v3, v4
; CHECK-P9-NEXT: xvcvsxwsp vs2, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
-; CHECK-P9-NEXT: vextsb2w v2, v2
+; CHECK-P9-NEXT: vslw v2, v2, v4
+; CHECK-P9-NEXT: vsraw v2, v2, v4
; CHECK-P9-NEXT: xvcvsxwsp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
index af3132f88f001a..4332f69774b51e 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
@@ -434,8 +434,12 @@ define <2 x double> @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
; CHECK-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-P9-NEXT: lxv vs0, 0(r3)
+; CHECK-P9-NEXT: addis r3, r2, .LCPI4_1@toc@ha
+; CHECK-P9-NEXT: addi r3, r3, .LCPI4_1@toc@l
+; CHECK-P9-NEXT: lxv v3, 0(r3)
; CHECK-P9-NEXT: xxperm v2, v2, vs0
-; CHECK-P9-NEXT: vextsb2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v3
+; CHECK-P9-NEXT: vsrad v2, v2, v3
; CHECK-P9-NEXT: xvcvsxddp v2, v2
; CHECK-P9-NEXT: blr
;
@@ -493,13 +497,18 @@ define void @test4elt_signed(ptr noalias nocapture sret(<4 x double>) %agg.resul
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI5_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI5_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsb2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
@@ -589,27 +598,34 @@ define void @test8elt_signed(ptr noalias nocapture sret(<8 x double>) %agg.resul
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI6_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI6_2@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI6_2@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI6_3@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI6_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI6_3@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI6_3@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI6_4@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI6_4@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
-; CHECK-P9-NEXT: vextsb2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
@@ -755,55 +771,66 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x double>) %agg.res
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_1@toc@l
+; CHECK-P9-NEXT: lxv v4, 0(r4)
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_2@toc@ha
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_2@toc@l
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI7_2@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI7_2@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_3@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI7_3@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI7_3@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_4@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_4@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI7_4@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI7_4@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_5@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_5@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs3, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI7_5@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI7_5@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_6@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_6@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs3, 48(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs4, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI7_6@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI7_6@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_7@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_7@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs4, 64(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs5, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
-; CHECK-P9-NEXT: addis r4, r2, .LCPI7_7@toc@ha
-; CHECK-P9-NEXT: addi r4, r4, .LCPI7_7@toc@l
+; CHECK-P9-NEXT: addis r4, r2, .LCPI7_8@toc@ha
+; CHECK-P9-NEXT: addi r4, r4, .LCPI7_8@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs5, 80(r3)
-; CHECK-P9-NEXT: vextsb2d v3, v3
+; CHECK-P9-NEXT: vsld v3, v3, v4
+; CHECK-P9-NEXT: vsrad v3, v3, v4
; CHECK-P9-NEXT: xvcvsxddp vs6, v3
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs6, 96(r3)
-; CHECK-P9-NEXT: vextsb2d v2, v2
+; CHECK-P9-NEXT: vsld v2, v2, v4
+; CHECK-P9-NEXT: vsrad v2, v2, v4
; CHECK-P9-NEXT: xvcvsxddp vs7, v2
; CHECK-P9-NEXT: stxv vs7, 112(r3)
; CHECK-P9-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
index 9cabe0c17d849d..5f2ff8baf8bf1b 100644
--- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
@@ -7268,8 +7268,12 @@ define <2 x double> @constrained_vector_sitofp_v2f64_v2i16(<2 x i16> %x) #0 {
; PC64LE9-NEXT: addis 3, 2, .LCPI155_0@toc@ha
; PC64LE9-NEXT: addi 3, 3, .LCPI155_0@toc@l
; PC64LE9-NEXT: lxv 0, 0(3)
+; PC64LE9-NEXT: addis 3, 2, .LCPI155_1@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI155_1@toc@l
+; PC64LE9-NEXT: lxv 35, 0(3)
; PC64LE9-NEXT: xxperm 34, 34, 0
-; PC64LE9-NEXT: vextsh2d 2, 2
+; PC64LE9-NEXT: vsld 2, 2, 3
+; PC64LE9-NEXT: vsrad 2, 2, 3
; PC64LE9-NEXT: xvcvsxddp 34, 34
; PC64LE9-NEXT: blr
entry:
@@ -7636,8 +7640,11 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i16(<4 x i16> %x) #0 {
;
; PC64LE9-LABEL: constrained_vector_sitofp_v4f32_v4i16:
; PC64LE9: # %bb.0: # %entry
+; PC64LE9-NEXT: vspltisw 3, 8
; PC64LE9-NEXT: vmrglh 2, 2, 2
-; PC64LE9-NEXT: vextsh2w 2, 2
+; PC64LE9-NEXT: vadduwm 3, 3, 3
+; PC64LE9-NEXT: vslw 2, 2, 3
+; PC64LE9-NEXT: vsraw 2, 2, 3
; PC64LE9-NEXT: xvcvsxwsp 34, 34
; PC64LE9-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 274f1cef49aa95..a6d4159667944c 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -26,8 +26,8 @@ define signext i32 @add_small_const(i32 signext %a) nounwind {
; RV64I-LABEL: add_small_const:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a0, a0, 1
-; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: srai a0, a0, 56
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: sraiw a0, a0, 24
; RV64I-NEXT: jalr zero, 0(ra)
;
; RV32C-LABEL: add_small_const:
@@ -40,8 +40,8 @@ define signext i32 @add_small_const(i32 signext %a) nounwind {
; RV64C-LABEL: add_small_const:
; RV64C: # %bb.0:
; RV64C-NEXT: c.addi a0, 1
-; RV64C-NEXT: c.slli a0, 56
-; RV64C-NEXT: c.srai a0, 56
+; RV64C-NEXT: c.slli a0, 24
+; RV64C-NEXT: sraiw a0, a0, 24
; RV64C-NEXT: c.jr ra
%1 = add i32 %a, 1
%2 = shl i32 %1, 24
@@ -60,11 +60,10 @@ define signext i32 @add_large_const(i32 signext %a) nounwind {
;
; RV64I-LABEL: add_large_const:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 48
-; RV64I-NEXT: lui a1, 4095
-; RV64I-NEXT: slli a1, a1, 36
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: lui a1, 65520
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srai a0, a0, 48
+; RV64I-NEXT: sraiw a0, a0, 16
; RV64I-NEXT: jalr zero, 0(ra)
;
; RV32C-LABEL: add_large_const:
@@ -77,11 +76,10 @@ define signext i32 @add_large_const(i32 signext %a) nounwind {
;
; RV64C-LABEL: add_large_const:
; RV64C: # %bb.0:
-; RV64C-NEXT: c.lui a1, 1
-; RV64C-NEXT: c.addi a1, -1
+; RV64C-NEXT: c.slli a0, 16
+; RV64C-NEXT: lui a1, 65520
; RV64C-NEXT: c.add a0, a1
-; RV64C-NEXT: c.slli a0, 48
-; RV64C-NEXT: c.srai a0, 48
+; RV64C-NEXT: sraiw a0, a0, 16
; RV64C-NEXT: c.jr ra
%1 = add i32 %a, 4095
%2 = shl i32 %1, 16
@@ -100,11 +98,10 @@ define signext i32 @add_huge_const(i32 signext %a) nounwind {
;
; RV64I-LABEL: add_huge_const:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 48
-; RV64I-NEXT: lui a1, 32767
-; RV64I-NEXT: slli a1, a1, 36
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: lui a1, 524272
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srai a0, a0, 48
+; RV64I-NEXT: sraiw a0, a0, 16
; RV64I-NEXT: jalr zero, 0(ra)
;
; RV32C-LABEL: add_huge_const:
@@ -117,11 +114,10 @@ define signext i32 @add_huge_const(i32 signext %a) nounwind {
;
; RV64C-LABEL: add_huge_const:
; RV64C: # %bb.0:
-; RV64C-NEXT: c.lui a1, 8
-; RV64C-NEXT: c.addi a1, -1
+; RV64C-NEXT: c.slli a0, 16
+; RV64C-NEXT: lui a1, 524272
; RV64C-NEXT: c.add a0, a1
-; RV64C-NEXT: c.slli a0, 48
-; RV64C-NEXT: c.srai a0, 48
+; RV64C-NEXT: sraiw a0, a0, 16
; RV64C-NEXT: c.jr ra
%1 = add i32 %a, 32767
%2 = shl i32 %1, 16
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
index 18b66499b85fe3..c15711586f9265 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
@@ -95,6 +95,7 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV32IA-LABEL: cmpxchg_masked_and_branch1:
; RV32IA: # %bb.0: # %entry
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a4, a0, 3
; RV32IA-NEXT: li a0, 255
; RV32IA-NEXT: sll a0, a0, a4
@@ -125,6 +126,7 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV32IA-ZACAS-LABEL: cmpxchg_masked_and_branch1:
; RV32IA-ZACAS: # %bb.0: # %entry
; RV32IA-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-ZACAS-NEXT: slli a4, a0, 3
; RV32IA-ZACAS-NEXT: li a0, 255
; RV32IA-ZACAS-NEXT: sll a0, a0, a4
@@ -155,6 +157,7 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV64IA-LABEL: cmpxchg_masked_and_branch1:
; RV64IA: # %bb.0: # %entry
; RV64IA-NEXT: andi a3, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a4, a0, 3
; RV64IA-NEXT: li a0, 255
; RV64IA-NEXT: sllw a0, a0, a4
@@ -185,6 +188,7 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV64IA-ZACAS-LABEL: cmpxchg_masked_and_branch1:
; RV64IA-ZACAS: # %bb.0: # %entry
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a4, a0, 3
; RV64IA-ZACAS-NEXT: li a0, 255
; RV64IA-ZACAS-NEXT: sllw a0, a0, a4
@@ -214,11 +218,13 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
;
; RV64IA-ZABHA-LABEL: cmpxchg_masked_and_branch1:
; RV64IA-ZABHA: # %bb.0: # %entry
+; RV64IA-ZABHA-NEXT: slli a3, a1, 56
+; RV64IA-ZABHA-NEXT: srai a3, a3, 56
; RV64IA-ZABHA-NEXT: .LBB2_1: # %do_cmpxchg
; RV64IA-ZABHA-NEXT: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZABHA-NEXT: mv a3, a1
-; RV64IA-ZABHA-NEXT: amocas.b.aqrl a3, a2, (a0)
-; RV64IA-ZABHA-NEXT: bne a3, a1, .LBB2_1
+; RV64IA-ZABHA-NEXT: mv a4, a1
+; RV64IA-ZABHA-NEXT: amocas.b.aqrl a4, a2, (a0)
+; RV64IA-ZABHA-NEXT: bne a4, a3, .LBB2_1
; RV64IA-ZABHA-NEXT: # %bb.2: # %exit
; RV64IA-ZABHA-NEXT: ret
entry:
@@ -235,6 +241,7 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV32IA-LABEL: cmpxchg_masked_and_branch2:
; RV32IA: # %bb.0: # %entry
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a4, a0, 3
; RV32IA-NEXT: li a0, 255
; RV32IA-NEXT: sll a0, a0, a4
@@ -268,6 +275,7 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV32IA-ZACAS-LABEL: cmpxchg_masked_and_branch2:
; RV32IA-ZACAS: # %bb.0: # %entry
; RV32IA-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-ZACAS-NEXT: slli a4, a0, 3
; RV32IA-ZACAS-NEXT: li a0, 255
; RV32IA-ZACAS-NEXT: sll a0, a0, a4
@@ -301,6 +309,7 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV64IA-LABEL: cmpxchg_masked_and_branch2:
; RV64IA: # %bb.0: # %entry
; RV64IA-NEXT: andi a3, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a4, a0, 3
; RV64IA-NEXT: li a0, 255
; RV64IA-NEXT: sllw a0, a0, a4
@@ -334,6 +343,7 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
; RV64IA-ZACAS-LABEL: cmpxchg_masked_and_branch2:
; RV64IA-ZACAS: # %bb.0: # %entry
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a4, a0, 3
; RV64IA-ZACAS-NEXT: li a0, 255
; RV64IA-ZACAS-NEXT: sllw a0, a0, a4
@@ -366,11 +376,13 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
;
; RV64IA-ZABHA-LABEL: cmpxchg_masked_and_branch2:
; RV64IA-ZABHA: # %bb.0: # %entry
+; RV64IA-ZABHA-NEXT: slli a3, a1, 56
+; RV64IA-ZABHA-NEXT: srai a3, a3, 56
; RV64IA-ZABHA-NEXT: .LBB3_1: # %do_cmpxchg
; RV64IA-ZABHA-NEXT: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZABHA-NEXT: mv a3, a1
-; RV64IA-ZABHA-NEXT: amocas.b.aqrl a3, a2, (a0)
-; RV64IA-ZABHA-NEXT: beq a3, a1, .LBB3_1
+; RV64IA-ZABHA-NEXT: mv a4, a1
+; RV64IA-ZABHA-NEXT: amocas.b.aqrl a4, a2, (a0)
+; RV64IA-ZABHA-NEXT: beq a4, a3, .LBB3_1
; RV64IA-ZABHA-NEXT: # %bb.2: # %exit
; RV64IA-ZABHA-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
index 394dffa346ec63..be488863c985f6 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -41,6 +41,7 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
; RV32IA-LABEL: cmpxchg_i8_monotonic_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a4, 255
; RV32IA-NEXT: sll a4, a4, a0
@@ -77,6 +78,7 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
; RV64IA-WMO-LABEL: cmpxchg_i8_monotonic_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -100,6 +102,7 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
; RV64IA-ZACAS-LABEL: cmpxchg_i8_monotonic_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a4, 255
; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
@@ -128,6 +131,7 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
; RV64IA-TSO-LABEL: cmpxchg_i8_monotonic_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -168,6 +172,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-LABEL: cmpxchg_i8_acquire_monotonic:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a4, 255
; RV32IA-WMO-NEXT: sll a4, a4, a0
@@ -191,6 +196,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i8_acquire_monotonic:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: li a4, 255
; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0
@@ -214,6 +220,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-LABEL: cmpxchg_i8_acquire_monotonic:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a4, 255
; RV32IA-TSO-NEXT: sll a4, a4, a0
@@ -237,6 +244,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i8_acquire_monotonic:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: li a4, 255
; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0
@@ -273,6 +281,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_acquire_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -296,6 +305,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i8_acquire_monotonic:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a4, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
@@ -324,6 +334,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_acquire_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -347,6 +358,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i8_acquire_monotonic:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a4, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
@@ -392,6 +404,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-LABEL: cmpxchg_i8_acquire_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a4, 255
; RV32IA-WMO-NEXT: sll a4, a4, a0
@@ -415,6 +428,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i8_acquire_acquire:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: li a4, 255
; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0
@@ -438,6 +452,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-LABEL: cmpxchg_i8_acquire_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a4, 255
; RV32IA-TSO-NEXT: sll a4, a4, a0
@@ -461,6 +476,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i8_acquire_acquire:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: li a4, 255
; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0
@@ -497,6 +513,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_acquire_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -520,6 +537,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i8_acquire_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a4, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
@@ -548,6 +566,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_acquire_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -571,6 +590,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i8_acquire_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a4, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
@@ -616,6 +636,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-LABEL: cmpxchg_i8_release_monotonic:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a4, 255
; RV32IA-WMO-NEXT: sll a4, a4, a0
@@ -639,6 +660,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i8_release_monotonic:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: li a4, 255
; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0
@@ -662,6 +684,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-LABEL: cmpxchg_i8_release_monotonic:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a4, 255
; RV32IA-TSO-NEXT: sll a4, a4, a0
@@ -685,6 +708,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i8_release_monotonic:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: li a4, 255
; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0
@@ -721,6 +745,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_release_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -744,6 +769,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i8_release_monotonic:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a4, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
@@ -772,6 +798,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_release_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -795,6 +822,7 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i8_release_monotonic:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a4, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
@@ -840,6 +868,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-LABEL: cmpxchg_i8_release_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a4, 255
; RV32IA-WMO-NEXT: sll a4, a4, a0
@@ -863,6 +892,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i8_release_acquire:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: li a4, 255
; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0
@@ -886,6 +916,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-LABEL: cmpxchg_i8_release_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a4, 255
; RV32IA-TSO-NEXT: sll a4, a4, a0
@@ -909,6 +940,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i8_release_acquire:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: li a4, 255
; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0
@@ -945,6 +977,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_release_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -968,6 +1001,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i8_release_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a4, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
@@ -996,6 +1030,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_release_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -1019,6 +1054,7 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i8_release_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a4, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
@@ -1064,6 +1100,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a4, 255
; RV32IA-WMO-NEXT: sll a4, a4, a0
@@ -1087,6 +1124,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: li a4, 255
; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0
@@ -1110,6 +1148,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a4, 255
; RV32IA-TSO-NEXT: sll a4, a4, a0
@@ -1133,6 +1172,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: li a4, 255
; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0
@@ -1169,6 +1209,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -1192,6 +1233,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a4, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
@@ -1220,6 +1262,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -1243,6 +1286,7 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i8_acq_rel_monotonic:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a4, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
@@ -1288,6 +1332,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a4, 255
; RV32IA-WMO-NEXT: sll a4, a4, a0
@@ -1311,6 +1356,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: li a4, 255
; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0
@@ -1334,6 +1380,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a4, 255
; RV32IA-TSO-NEXT: sll a4, a4, a0
@@ -1357,6 +1404,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: li a4, 255
; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0
@@ -1393,6 +1441,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -1416,6 +1465,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a4, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
@@ -1444,6 +1494,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -1467,6 +1518,7 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i8_acq_rel_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a4, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
@@ -1512,6 +1564,7 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-LABEL: cmpxchg_i8_seq_cst_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a4, 255
; RV32IA-NEXT: sll a4, a4, a0
@@ -1548,6 +1601,7 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_seq_cst_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -1571,6 +1625,7 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-ZACAS-LABEL: cmpxchg_i8_seq_cst_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a4, 255
; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
@@ -1599,6 +1654,7 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_seq_cst_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -1644,6 +1700,7 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-LABEL: cmpxchg_i8_seq_cst_acquire:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a4, 255
; RV32IA-NEXT: sll a4, a4, a0
@@ -1680,6 +1737,7 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_seq_cst_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -1703,6 +1761,7 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-ZACAS-LABEL: cmpxchg_i8_seq_cst_acquire:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a4, 255
; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
@@ -1731,6 +1790,7 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_seq_cst_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -1776,6 +1836,7 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV32IA-LABEL: cmpxchg_i8_seq_cst_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a4, 255
; RV32IA-NEXT: sll a4, a4, a0
@@ -1812,6 +1873,7 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-WMO-LABEL: cmpxchg_i8_seq_cst_seq_cst:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: li a4, 255
; RV64IA-WMO-NEXT: sllw a4, a4, a0
@@ -1835,6 +1897,7 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-ZACAS-LABEL: cmpxchg_i8_seq_cst_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a4, 255
; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
@@ -1863,6 +1926,7 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; RV64IA-TSO-LABEL: cmpxchg_i8_seq_cst_seq_cst:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: li a4, 255
; RV64IA-TSO-NEXT: sllw a4, a4, a0
@@ -1908,6 +1972,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
; RV32IA-LABEL: cmpxchg_i16_monotonic_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a4, 16
; RV32IA-NEXT: addi a4, a4, -1
@@ -1945,6 +2010,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
; RV64IA-WMO-LABEL: cmpxchg_i16_monotonic_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -1969,6 +2035,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
; RV64IA-ZACAS-LABEL: cmpxchg_i16_monotonic_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a4, 16
; RV64IA-ZACAS-NEXT: addi a4, a4, -1
@@ -1998,6 +2065,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
; RV64IA-TSO-LABEL: cmpxchg_i16_monotonic_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -2039,6 +2107,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-WMO-LABEL: cmpxchg_i16_acquire_monotonic:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a4, 16
; RV32IA-WMO-NEXT: addi a4, a4, -1
@@ -2063,6 +2132,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i16_acquire_monotonic:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: lui a4, 16
; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2087,6 +2157,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-TSO-LABEL: cmpxchg_i16_acquire_monotonic:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a4, 16
; RV32IA-TSO-NEXT: addi a4, a4, -1
@@ -2111,6 +2182,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i16_acquire_monotonic:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: lui a4, 16
; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2148,6 +2220,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-LABEL: cmpxchg_i16_acquire_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -2172,6 +2245,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i16_acquire_monotonic:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2201,6 +2275,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-LABEL: cmpxchg_i16_acquire_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -2225,6 +2300,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i16_acquire_monotonic:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2271,6 +2347,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-WMO-LABEL: cmpxchg_i16_acquire_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a4, 16
; RV32IA-WMO-NEXT: addi a4, a4, -1
@@ -2295,6 +2372,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i16_acquire_acquire:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: lui a4, 16
; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2319,6 +2397,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-TSO-LABEL: cmpxchg_i16_acquire_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a4, 16
; RV32IA-TSO-NEXT: addi a4, a4, -1
@@ -2343,6 +2422,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i16_acquire_acquire:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: lui a4, 16
; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2380,6 +2460,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-LABEL: cmpxchg_i16_acquire_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -2404,6 +2485,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i16_acquire_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2433,6 +2515,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-LABEL: cmpxchg_i16_acquire_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -2457,6 +2540,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i16_acquire_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2503,6 +2587,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-WMO-LABEL: cmpxchg_i16_release_monotonic:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a4, 16
; RV32IA-WMO-NEXT: addi a4, a4, -1
@@ -2527,6 +2612,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i16_release_monotonic:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: lui a4, 16
; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2551,6 +2637,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-TSO-LABEL: cmpxchg_i16_release_monotonic:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a4, 16
; RV32IA-TSO-NEXT: addi a4, a4, -1
@@ -2575,6 +2662,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i16_release_monotonic:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: lui a4, 16
; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2612,6 +2700,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-LABEL: cmpxchg_i16_release_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -2636,6 +2725,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i16_release_monotonic:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2665,6 +2755,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-LABEL: cmpxchg_i16_release_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -2689,6 +2780,7 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i16_release_monotonic:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2735,6 +2827,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-WMO-LABEL: cmpxchg_i16_release_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a4, 16
; RV32IA-WMO-NEXT: addi a4, a4, -1
@@ -2759,6 +2852,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i16_release_acquire:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: lui a4, 16
; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2783,6 +2877,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-TSO-LABEL: cmpxchg_i16_release_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a4, 16
; RV32IA-TSO-NEXT: addi a4, a4, -1
@@ -2807,6 +2902,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i16_release_acquire:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: lui a4, 16
; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2844,6 +2940,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-LABEL: cmpxchg_i16_release_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -2868,6 +2965,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i16_release_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -2897,6 +2995,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-LABEL: cmpxchg_i16_release_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -2921,6 +3020,7 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i16_release_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -2967,6 +3067,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-WMO-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a4, 16
; RV32IA-WMO-NEXT: addi a4, a4, -1
@@ -2991,6 +3092,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: lui a4, 16
; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -3015,6 +3117,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-TSO-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a4, 16
; RV32IA-TSO-NEXT: addi a4, a4, -1
@@ -3039,6 +3142,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: lui a4, 16
; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -3076,6 +3180,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -3100,6 +3205,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -3129,6 +3235,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -3153,6 +3260,7 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i16_acq_rel_monotonic:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -3199,6 +3307,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-WMO-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a3, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a4, 16
; RV32IA-WMO-NEXT: addi a4, a4, -1
@@ -3223,6 +3332,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-WMO-ZACAS-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV32IA-WMO-ZACAS: # %bb.0:
; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-WMO-ZACAS-NEXT: lui a4, 16
; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -3247,6 +3357,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-TSO-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a3, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a4, 16
; RV32IA-TSO-NEXT: addi a4, a4, -1
@@ -3271,6 +3382,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-TSO-ZACAS-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV32IA-TSO-ZACAS: # %bb.0:
; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV32IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV32IA-TSO-ZACAS-NEXT: lui a4, 16
; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -3308,6 +3420,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -3332,6 +3445,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-ZACAS-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
@@ -3361,6 +3475,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -3385,6 +3500,7 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-ZACAS-LABEL: cmpxchg_i16_acq_rel_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
@@ -3431,6 +3547,7 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV32IA-LABEL: cmpxchg_i16_seq_cst_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a4, 16
; RV32IA-NEXT: addi a4, a4, -1
@@ -3468,6 +3585,7 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-WMO-LABEL: cmpxchg_i16_seq_cst_monotonic:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -3492,6 +3610,7 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-ZACAS-LABEL: cmpxchg_i16_seq_cst_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a4, 16
; RV64IA-ZACAS-NEXT: addi a4, a4, -1
@@ -3521,6 +3640,7 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
; RV64IA-TSO-LABEL: cmpxchg_i16_seq_cst_monotonic:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -3567,6 +3687,7 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-LABEL: cmpxchg_i16_seq_cst_acquire:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a4, 16
; RV32IA-NEXT: addi a4, a4, -1
@@ -3604,6 +3725,7 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-LABEL: cmpxchg_i16_seq_cst_acquire:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -3628,6 +3750,7 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-ZACAS-LABEL: cmpxchg_i16_seq_cst_acquire:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a4, 16
; RV64IA-ZACAS-NEXT: addi a4, a4, -1
@@ -3657,6 +3780,7 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-LABEL: cmpxchg_i16_seq_cst_acquire:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
@@ -3703,6 +3827,7 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV32IA-LABEL: cmpxchg_i16_seq_cst_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a4, 16
; RV32IA-NEXT: addi a4, a4, -1
@@ -3740,6 +3865,7 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-WMO-LABEL: cmpxchg_i16_seq_cst_seq_cst:
; RV64IA-WMO: # %bb.0:
; RV64IA-WMO-NEXT: andi a3, a0, -4
+; RV64IA-WMO-NEXT: andi a0, a0, 3
; RV64IA-WMO-NEXT: slli a0, a0, 3
; RV64IA-WMO-NEXT: lui a4, 16
; RV64IA-WMO-NEXT: addi a4, a4, -1
@@ -3764,6 +3890,7 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-ZACAS-LABEL: cmpxchg_i16_seq_cst_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a3, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a4, 16
; RV64IA-ZACAS-NEXT: addi a4, a4, -1
@@ -3793,6 +3920,7 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
; RV64IA-TSO-LABEL: cmpxchg_i16_seq_cst_seq_cst:
; RV64IA-TSO: # %bb.0:
; RV64IA-TSO-NEXT: andi a3, a0, -4
+; RV64IA-TSO-NEXT: andi a0, a0, 3
; RV64IA-TSO-NEXT: slli a0, a0, 3
; RV64IA-TSO-NEXT: lui a4, 16
; RV64IA-TSO-NEXT: addi a4, a4, -1
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index 4871622ec23637..dc6e9ec645fb5f 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -44,6 +44,7 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -74,6 +75,7 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -94,6 +96,7 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -138,6 +141,7 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -158,6 +162,7 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -188,6 +193,7 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -208,6 +214,7 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -228,6 +235,7 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -248,6 +256,7 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -292,6 +301,7 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -312,6 +322,7 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -342,6 +353,7 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -362,6 +374,7 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -382,6 +395,7 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -402,6 +416,7 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -446,6 +461,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -466,6 +482,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -496,6 +513,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -516,6 +534,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -536,6 +555,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -556,6 +576,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -600,6 +621,7 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -630,6 +652,7 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -650,6 +673,7 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -698,6 +722,7 @@ define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_0_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a1, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a2, 255
; RV32IA-NEXT: sll a2, a2, a0
@@ -720,6 +745,7 @@ define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_0_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a2, 255
; RV64IA-NOZACAS-NEXT: sllw a2, a2, a0
@@ -731,6 +757,7 @@ define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_0_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a2, 255
; RV64IA-ZACAS-NEXT: sllw a2, a2, a0
@@ -767,6 +794,7 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -778,6 +806,7 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -800,6 +829,7 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -811,6 +841,7 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -822,6 +853,7 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -833,6 +865,7 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -869,6 +902,7 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -880,6 +914,7 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -902,6 +937,7 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -913,6 +949,7 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -924,6 +961,7 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -935,6 +973,7 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -971,6 +1010,7 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -982,6 +1022,7 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -1004,6 +1045,7 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1015,6 +1057,7 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1026,6 +1069,7 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1037,6 +1081,7 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1073,6 +1118,7 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -1084,6 +1130,7 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -1106,6 +1153,7 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1117,6 +1165,7 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1128,6 +1177,7 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1139,6 +1189,7 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i8_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1175,6 +1226,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_minus_1_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a1, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a2, 255
; RV32IA-NEXT: sll a2, a2, a0
@@ -1196,6 +1248,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a2, 255
; RV64IA-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1206,6 +1259,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a2, 255
; RV64IA-ZACAS-NEXT: sllw a2, a2, a0
@@ -1243,6 +1297,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -1253,6 +1308,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -1274,6 +1330,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1284,6 +1341,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1294,6 +1352,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1304,6 +1363,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1341,6 +1401,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -1351,6 +1412,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -1372,6 +1434,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1382,6 +1445,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1392,6 +1456,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1402,6 +1467,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1439,6 +1505,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -1449,6 +1516,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -1470,6 +1538,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1480,6 +1549,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1490,6 +1560,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1500,6 +1571,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1537,6 +1609,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a2, 255
; RV32IA-WMO-NEXT: sll a2, a2, a0
@@ -1547,6 +1620,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a2, 255
; RV32IA-TSO-NEXT: sll a2, a2, a0
@@ -1568,6 +1642,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a2, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1578,6 +1653,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a2, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a2, a2, a0
@@ -1588,6 +1664,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a2, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1598,6 +1675,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a2, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a2, a2, a0
@@ -1634,6 +1712,7 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_add_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -1664,6 +1743,7 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_add_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -1684,6 +1764,7 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_add_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -1728,6 +1809,7 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_add_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -1748,6 +1830,7 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_add_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -1778,6 +1861,7 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_add_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -1798,6 +1882,7 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_add_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -1818,6 +1903,7 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_add_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -1838,6 +1924,7 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_add_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -1882,6 +1969,7 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_add_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -1902,6 +1990,7 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_add_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -1932,6 +2021,7 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_add_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -1952,6 +2042,7 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_add_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -1972,6 +2063,7 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_add_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -1992,6 +2084,7 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_add_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2036,6 +2129,7 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_add_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -2056,6 +2150,7 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_add_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -2086,6 +2181,7 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_add_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2106,6 +2202,7 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_add_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2126,6 +2223,7 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_add_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2146,6 +2244,7 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_add_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2190,6 +2289,7 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_add_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -2220,6 +2320,7 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_add_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2240,6 +2341,7 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_add_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -2284,6 +2386,7 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_sub_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -2314,6 +2417,7 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_sub_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2334,6 +2438,7 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_sub_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -2380,6 +2485,7 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_sub_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -2400,6 +2506,7 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_sub_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -2430,6 +2537,7 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_sub_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2450,6 +2558,7 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_sub_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2470,6 +2579,7 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_sub_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2490,6 +2600,7 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_sub_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2536,6 +2647,7 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_sub_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -2556,6 +2668,7 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_sub_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -2586,6 +2699,7 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_sub_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2606,6 +2720,7 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_sub_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2626,6 +2741,7 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_sub_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2646,6 +2762,7 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_sub_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2692,6 +2809,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_sub_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -2712,6 +2830,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_sub_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -2742,6 +2861,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_sub_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2762,6 +2882,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_sub_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2782,6 +2903,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_sub_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2802,6 +2924,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_sub_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -2848,6 +2971,7 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_sub_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -2878,6 +3002,7 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_sub_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2898,6 +3023,7 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_sub_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -2944,6 +3070,7 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_and_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -2968,6 +3095,7 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_and_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -2982,6 +3110,7 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_and_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -3020,6 +3149,7 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -3034,6 +3164,7 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -3058,6 +3189,7 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3072,6 +3204,7 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3086,6 +3219,7 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3100,6 +3234,7 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3138,6 +3273,7 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -3152,6 +3288,7 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -3176,6 +3313,7 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3190,6 +3328,7 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3204,6 +3343,7 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3218,6 +3358,7 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3256,6 +3397,7 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -3270,6 +3412,7 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -3294,6 +3437,7 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3308,6 +3452,7 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3322,6 +3467,7 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3336,6 +3482,7 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3374,6 +3521,7 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i8_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -3388,6 +3536,7 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i8_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -3412,6 +3561,7 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i8_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3426,6 +3576,7 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i8_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3440,6 +3591,7 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i8_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3454,6 +3606,7 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i8_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3492,6 +3645,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_nand_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -3523,6 +3677,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_nand_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3544,6 +3699,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_nand_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -3565,6 +3721,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_monotonic:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3586,6 +3743,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_monotonic:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3651,6 +3809,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_nand_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -3672,6 +3831,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_nand_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -3703,6 +3863,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_nand_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3724,6 +3885,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_nand_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3745,6 +3907,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_nand_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3766,6 +3929,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_nand_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3787,6 +3951,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_acquire:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3808,6 +3973,7 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_acquire:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3873,6 +4039,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_nand_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -3894,6 +4061,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_nand_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -3925,6 +4093,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_nand_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3946,6 +4115,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_nand_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -3967,6 +4137,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_nand_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -3988,6 +4159,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_nand_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -4009,6 +4181,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_release:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4030,6 +4203,7 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_release:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4095,6 +4269,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_nand_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -4116,6 +4291,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_nand_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -4147,6 +4323,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_nand_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4168,6 +4345,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_nand_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4189,6 +4367,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_nand_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -4210,6 +4389,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_nand_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -4231,6 +4411,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_acq_rel:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4252,6 +4433,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_acq_rel:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4317,6 +4499,7 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_nand_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -4348,6 +4531,7 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_nand_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4369,6 +4553,7 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_nand_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -4390,6 +4575,7 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_seq_cst:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4411,6 +4597,7 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i8_seq_cst:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -4476,6 +4663,7 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_or_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: andi a1, a1, 255
; RV32IA-NEXT: sll a1, a1, a0
@@ -4496,6 +4684,7 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_or_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4506,6 +4695,7 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_or_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
@@ -4540,6 +4730,7 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -4550,6 +4741,7 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -4570,6 +4762,7 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4580,6 +4773,7 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4590,6 +4784,7 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4600,6 +4795,7 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4634,6 +4830,7 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -4644,6 +4841,7 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -4664,6 +4862,7 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4674,6 +4873,7 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4684,6 +4884,7 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4694,6 +4895,7 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4728,6 +4930,7 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -4738,6 +4941,7 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -4758,6 +4962,7 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4768,6 +4973,7 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4778,6 +4984,7 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4788,6 +4995,7 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4822,6 +5030,7 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i8_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -4832,6 +5041,7 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i8_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -4852,6 +5062,7 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i8_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4862,6 +5073,7 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i8_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4872,6 +5084,7 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i8_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4882,6 +5095,7 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i8_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -4916,6 +5130,7 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xor_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: andi a1, a1, 255
; RV32IA-NEXT: sll a1, a1, a0
@@ -4936,6 +5151,7 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xor_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
@@ -4946,6 +5162,7 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xor_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
@@ -4980,6 +5197,7 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -4990,6 +5208,7 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -5010,6 +5229,7 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5020,6 +5240,7 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5030,6 +5251,7 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5040,6 +5262,7 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5074,6 +5297,7 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -5084,6 +5308,7 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -5104,6 +5329,7 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5114,6 +5340,7 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5124,6 +5351,7 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5134,6 +5362,7 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5168,6 +5397,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -5178,6 +5408,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -5198,6 +5429,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5208,6 +5440,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5218,6 +5451,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5228,6 +5462,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5262,6 +5497,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i8_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: andi a1, a1, 255
; RV32IA-WMO-NEXT: sll a1, a1, a0
@@ -5272,6 +5508,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i8_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: andi a1, a1, 255
; RV32IA-TSO-NEXT: sll a1, a1, a0
@@ -5292,6 +5529,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i8_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5302,6 +5540,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i8_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
@@ -5312,6 +5551,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i8_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5322,6 +5562,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i8_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
@@ -5388,24 +5629,24 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_max_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: li a4, 255
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: li a3, 255
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 24
; RV32IA-NEXT: srai a1, a1, 24
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: xori a3, a3, 24
+; RV32IA-NEXT: xori a4, a0, 24
; RV32IA-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1
; RV32IA-NEXT: lr.w a5, (a2)
-; RV32IA-NEXT: and a7, a5, a4
+; RV32IA-NEXT: and a7, a5, a3
; RV32IA-NEXT: mv a6, a5
-; RV32IA-NEXT: sll a7, a7, a3
-; RV32IA-NEXT: sra a7, a7, a3
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a7, a1, .LBB45_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1
; RV32IA-NEXT: xor a6, a5, a1
-; RV32IA-NEXT: and a6, a6, a4
+; RV32IA-NEXT: and a6, a6, a3
; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
@@ -5459,24 +5700,24 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_max_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: li a4, 255
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: li a3, 255
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-NOZACAS-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1
; RV64IA-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-NOZACAS-NEXT: mv a6, a5
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB45_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1
; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -5488,24 +5729,24 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_max_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: li a4, 255
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: li a3, 255
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-ZACAS-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1
; RV64IA-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
; RV64IA-ZACAS-NEXT: mv a6, a5
-; RV64IA-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB45_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1
; RV64IA-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -5573,24 +5814,24 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_max_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: li a4, 255
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: li a3, 255
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 24
; RV32IA-WMO-NEXT: srai a1, a1, 24
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: xori a3, a3, 24
+; RV32IA-WMO-NEXT: xori a4, a0, 24
; RV32IA-WMO-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1
; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT: and a7, a5, a4
+; RV32IA-WMO-NEXT: and a7, a5, a3
; RV32IA-WMO-NEXT: mv a6, a5
-; RV32IA-WMO-NEXT: sll a7, a7, a3
-; RV32IA-WMO-NEXT: sra a7, a7, a3
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a7, a1, .LBB46_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1
; RV32IA-WMO-NEXT: xor a6, a5, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
+; RV32IA-WMO-NEXT: and a6, a6, a3
; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1
; RV32IA-WMO-NEXT: sc.w a6, a6, (a2)
@@ -5602,24 +5843,24 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_max_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: li a4, 255
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: li a3, 255
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 24
; RV32IA-TSO-NEXT: srai a1, a1, 24
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: xori a3, a3, 24
+; RV32IA-TSO-NEXT: xori a4, a0, 24
; RV32IA-TSO-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1
; RV32IA-TSO-NEXT: lr.w a5, (a2)
-; RV32IA-TSO-NEXT: and a7, a5, a4
+; RV32IA-TSO-NEXT: and a7, a5, a3
; RV32IA-TSO-NEXT: mv a6, a5
-; RV32IA-TSO-NEXT: sll a7, a7, a3
-; RV32IA-TSO-NEXT: sra a7, a7, a3
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a7, a1, .LBB46_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1
; RV32IA-TSO-NEXT: xor a6, a5, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
+; RV32IA-TSO-NEXT: and a6, a6, a3
; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
@@ -5673,24 +5914,24 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_max_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-NOZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB46_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -5702,24 +5943,24 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-NOZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB46_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -5731,24 +5972,24 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: li a4, 255
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: li a3, 255
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-ZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB46_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -5760,24 +6001,24 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: li a4, 255
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: li a3, 255
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-ZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB46_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -5845,24 +6086,24 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_max_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: li a4, 255
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: li a3, 255
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 24
; RV32IA-WMO-NEXT: srai a1, a1, 24
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: xori a3, a3, 24
+; RV32IA-WMO-NEXT: xori a4, a0, 24
; RV32IA-WMO-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1
; RV32IA-WMO-NEXT: lr.w a5, (a2)
-; RV32IA-WMO-NEXT: and a7, a5, a4
+; RV32IA-WMO-NEXT: and a7, a5, a3
; RV32IA-WMO-NEXT: mv a6, a5
-; RV32IA-WMO-NEXT: sll a7, a7, a3
-; RV32IA-WMO-NEXT: sra a7, a7, a3
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a7, a1, .LBB47_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1
; RV32IA-WMO-NEXT: xor a6, a5, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
+; RV32IA-WMO-NEXT: and a6, a6, a3
; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
@@ -5874,24 +6115,24 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_max_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: li a4, 255
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: li a3, 255
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 24
; RV32IA-TSO-NEXT: srai a1, a1, 24
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: xori a3, a3, 24
+; RV32IA-TSO-NEXT: xori a4, a0, 24
; RV32IA-TSO-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1
; RV32IA-TSO-NEXT: lr.w a5, (a2)
-; RV32IA-TSO-NEXT: and a7, a5, a4
+; RV32IA-TSO-NEXT: and a7, a5, a3
; RV32IA-TSO-NEXT: mv a6, a5
-; RV32IA-TSO-NEXT: sll a7, a7, a3
-; RV32IA-TSO-NEXT: sra a7, a7, a3
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a7, a1, .LBB47_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1
; RV32IA-TSO-NEXT: xor a6, a5, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
+; RV32IA-TSO-NEXT: and a6, a6, a3
; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
@@ -5945,24 +6186,24 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_max_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-NOZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB47_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -5974,24 +6215,24 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-NOZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB47_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6003,24 +6244,24 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: li a4, 255
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: li a3, 255
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-ZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB47_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -6032,24 +6273,24 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: li a4, 255
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: li a3, 255
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-ZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB47_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6117,24 +6358,24 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_max_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: li a4, 255
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: li a3, 255
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 24
; RV32IA-WMO-NEXT: srai a1, a1, 24
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: xori a3, a3, 24
+; RV32IA-WMO-NEXT: xori a4, a0, 24
; RV32IA-WMO-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1
; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT: and a7, a5, a4
+; RV32IA-WMO-NEXT: and a7, a5, a3
; RV32IA-WMO-NEXT: mv a6, a5
-; RV32IA-WMO-NEXT: sll a7, a7, a3
-; RV32IA-WMO-NEXT: sra a7, a7, a3
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a7, a1, .LBB48_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1
; RV32IA-WMO-NEXT: xor a6, a5, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
+; RV32IA-WMO-NEXT: and a6, a6, a3
; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
@@ -6146,24 +6387,24 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_max_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: li a4, 255
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: li a3, 255
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 24
; RV32IA-TSO-NEXT: srai a1, a1, 24
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: xori a3, a3, 24
+; RV32IA-TSO-NEXT: xori a4, a0, 24
; RV32IA-TSO-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1
; RV32IA-TSO-NEXT: lr.w a5, (a2)
-; RV32IA-TSO-NEXT: and a7, a5, a4
+; RV32IA-TSO-NEXT: and a7, a5, a3
; RV32IA-TSO-NEXT: mv a6, a5
-; RV32IA-TSO-NEXT: sll a7, a7, a3
-; RV32IA-TSO-NEXT: sra a7, a7, a3
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a7, a1, .LBB48_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1
; RV32IA-TSO-NEXT: xor a6, a5, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
+; RV32IA-TSO-NEXT: and a6, a6, a3
; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
@@ -6217,24 +6458,24 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_max_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-NOZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB48_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -6246,24 +6487,24 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-NOZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB48_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6275,24 +6516,24 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: li a4, 255
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: li a3, 255
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-ZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB48_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -6304,24 +6545,24 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: li a4, 255
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: li a3, 255
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-ZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB48_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6389,24 +6630,24 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_max_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: li a4, 255
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: li a3, 255
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 24
; RV32IA-NEXT: srai a1, a1, 24
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: xori a3, a3, 24
+; RV32IA-NEXT: xori a4, a0, 24
; RV32IA-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1
; RV32IA-NEXT: lr.w.aqrl a5, (a2)
-; RV32IA-NEXT: and a7, a5, a4
+; RV32IA-NEXT: and a7, a5, a3
; RV32IA-NEXT: mv a6, a5
-; RV32IA-NEXT: sll a7, a7, a3
-; RV32IA-NEXT: sra a7, a7, a3
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a7, a1, .LBB49_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1
; RV32IA-NEXT: xor a6, a5, a1
-; RV32IA-NEXT: and a6, a6, a4
+; RV32IA-NEXT: and a6, a6, a3
; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB49_3: # in Loop: Header=BB49_1 Depth=1
; RV32IA-NEXT: sc.w.rl a6, a6, (a2)
@@ -6460,24 +6701,24 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_max_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: li a4, 255
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: li a3, 255
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-NOZACAS-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1
; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-NOZACAS-NEXT: mv a6, a5
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB49_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1
; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB49_3: # in Loop: Header=BB49_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -6489,24 +6730,24 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_max_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: li a4, 255
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: li a3, 255
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-ZACAS-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1
; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
; RV64IA-ZACAS-NEXT: mv a6, a5
-; RV64IA-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB49_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1
; RV64IA-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB49_3: # in Loop: Header=BB49_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -6574,24 +6815,24 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_min_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: li a4, 255
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: li a3, 255
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 24
; RV32IA-NEXT: srai a1, a1, 24
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: xori a3, a3, 24
+; RV32IA-NEXT: xori a4, a0, 24
; RV32IA-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1
; RV32IA-NEXT: lr.w a5, (a2)
-; RV32IA-NEXT: and a7, a5, a4
+; RV32IA-NEXT: and a7, a5, a3
; RV32IA-NEXT: mv a6, a5
-; RV32IA-NEXT: sll a7, a7, a3
-; RV32IA-NEXT: sra a7, a7, a3
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a1, a7, .LBB50_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB50_1 Depth=1
; RV32IA-NEXT: xor a6, a5, a1
-; RV32IA-NEXT: and a6, a6, a4
+; RV32IA-NEXT: and a6, a6, a3
; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB50_3: # in Loop: Header=BB50_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
@@ -6645,24 +6886,24 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_min_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: li a4, 255
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: li a3, 255
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-NOZACAS-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1
; RV64IA-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-NOZACAS-NEXT: mv a6, a5
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB50_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB50_1 Depth=1
; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB50_3: # in Loop: Header=BB50_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6674,24 +6915,24 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_min_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: li a4, 255
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: li a3, 255
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-ZACAS-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1
; RV64IA-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
; RV64IA-ZACAS-NEXT: mv a6, a5
-; RV64IA-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB50_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB50_1 Depth=1
; RV64IA-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB50_3: # in Loop: Header=BB50_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6759,24 +7000,24 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_min_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: li a4, 255
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: li a3, 255
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 24
; RV32IA-WMO-NEXT: srai a1, a1, 24
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: xori a3, a3, 24
+; RV32IA-WMO-NEXT: xori a4, a0, 24
; RV32IA-WMO-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1
; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT: and a7, a5, a4
+; RV32IA-WMO-NEXT: and a7, a5, a3
; RV32IA-WMO-NEXT: mv a6, a5
-; RV32IA-WMO-NEXT: sll a7, a7, a3
-; RV32IA-WMO-NEXT: sra a7, a7, a3
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a1, a7, .LBB51_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1
; RV32IA-WMO-NEXT: xor a6, a5, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
+; RV32IA-WMO-NEXT: and a6, a6, a3
; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1
; RV32IA-WMO-NEXT: sc.w a6, a6, (a2)
@@ -6788,24 +7029,24 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_min_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: li a4, 255
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: li a3, 255
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 24
; RV32IA-TSO-NEXT: srai a1, a1, 24
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: xori a3, a3, 24
+; RV32IA-TSO-NEXT: xori a4, a0, 24
; RV32IA-TSO-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1
; RV32IA-TSO-NEXT: lr.w a5, (a2)
-; RV32IA-TSO-NEXT: and a7, a5, a4
+; RV32IA-TSO-NEXT: and a7, a5, a3
; RV32IA-TSO-NEXT: mv a6, a5
-; RV32IA-TSO-NEXT: sll a7, a7, a3
-; RV32IA-TSO-NEXT: sra a7, a7, a3
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a1, a7, .LBB51_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1
; RV32IA-TSO-NEXT: xor a6, a5, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
+; RV32IA-TSO-NEXT: and a6, a6, a3
; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
@@ -6859,24 +7100,24 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_min_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-NOZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB51_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6888,24 +7129,24 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-NOZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB51_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6917,24 +7158,24 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: li a4, 255
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: li a3, 255
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-ZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB51_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -6946,24 +7187,24 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: li a4, 255
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: li a3, 255
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-ZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB51_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -7031,24 +7272,24 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_min_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: li a4, 255
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: li a3, 255
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 24
; RV32IA-WMO-NEXT: srai a1, a1, 24
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: xori a3, a3, 24
+; RV32IA-WMO-NEXT: xori a4, a0, 24
; RV32IA-WMO-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1
; RV32IA-WMO-NEXT: lr.w a5, (a2)
-; RV32IA-WMO-NEXT: and a7, a5, a4
+; RV32IA-WMO-NEXT: and a7, a5, a3
; RV32IA-WMO-NEXT: mv a6, a5
-; RV32IA-WMO-NEXT: sll a7, a7, a3
-; RV32IA-WMO-NEXT: sra a7, a7, a3
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a1, a7, .LBB52_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1
; RV32IA-WMO-NEXT: xor a6, a5, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
+; RV32IA-WMO-NEXT: and a6, a6, a3
; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
@@ -7060,24 +7301,24 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_min_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: li a4, 255
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: li a3, 255
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 24
; RV32IA-TSO-NEXT: srai a1, a1, 24
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: xori a3, a3, 24
+; RV32IA-TSO-NEXT: xori a4, a0, 24
; RV32IA-TSO-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1
; RV32IA-TSO-NEXT: lr.w a5, (a2)
-; RV32IA-TSO-NEXT: and a7, a5, a4
+; RV32IA-TSO-NEXT: and a7, a5, a3
; RV32IA-TSO-NEXT: mv a6, a5
-; RV32IA-TSO-NEXT: sll a7, a7, a3
-; RV32IA-TSO-NEXT: sra a7, a7, a3
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a1, a7, .LBB52_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1
; RV32IA-TSO-NEXT: xor a6, a5, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
+; RV32IA-TSO-NEXT: and a6, a6, a3
; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
@@ -7131,24 +7372,24 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_min_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-NOZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB52_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -7160,24 +7401,24 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-NOZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB52_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -7189,24 +7430,24 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: li a4, 255
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: li a3, 255
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-ZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB52_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -7218,24 +7459,24 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: li a4, 255
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: li a3, 255
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-ZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB52_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -7303,24 +7544,24 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_min_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: li a4, 255
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: li a3, 255
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 24
; RV32IA-WMO-NEXT: srai a1, a1, 24
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: xori a3, a3, 24
+; RV32IA-WMO-NEXT: xori a4, a0, 24
; RV32IA-WMO-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1
; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT: and a7, a5, a4
+; RV32IA-WMO-NEXT: and a7, a5, a3
; RV32IA-WMO-NEXT: mv a6, a5
-; RV32IA-WMO-NEXT: sll a7, a7, a3
-; RV32IA-WMO-NEXT: sra a7, a7, a3
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a1, a7, .LBB53_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1
; RV32IA-WMO-NEXT: xor a6, a5, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
+; RV32IA-WMO-NEXT: and a6, a6, a3
; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
@@ -7332,24 +7573,24 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_min_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: li a4, 255
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: li a3, 255
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 24
; RV32IA-TSO-NEXT: srai a1, a1, 24
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: xori a3, a3, 24
+; RV32IA-TSO-NEXT: xori a4, a0, 24
; RV32IA-TSO-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1
; RV32IA-TSO-NEXT: lr.w a5, (a2)
-; RV32IA-TSO-NEXT: and a7, a5, a4
+; RV32IA-TSO-NEXT: and a7, a5, a3
; RV32IA-TSO-NEXT: mv a6, a5
-; RV32IA-TSO-NEXT: sll a7, a7, a3
-; RV32IA-TSO-NEXT: sra a7, a7, a3
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a1, a7, .LBB53_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1
; RV32IA-TSO-NEXT: xor a6, a5, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
+; RV32IA-TSO-NEXT: and a6, a6, a3
; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
@@ -7403,24 +7644,24 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_min_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-NOZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB53_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -7432,24 +7673,24 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-NOZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB53_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
@@ -7461,24 +7702,24 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: li a4, 255
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: li a3, 255
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-WMO-ZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1
; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB53_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -7490,24 +7731,24 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: li a4, 255
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: li a3, 255
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-TSO-ZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1
; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB53_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
@@ -7575,24 +7816,24 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_min_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: li a4, 255
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: li a3, 255
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 24
; RV32IA-NEXT: srai a1, a1, 24
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: xori a3, a3, 24
+; RV32IA-NEXT: xori a4, a0, 24
; RV32IA-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1
; RV32IA-NEXT: lr.w.aqrl a5, (a2)
-; RV32IA-NEXT: and a7, a5, a4
+; RV32IA-NEXT: and a7, a5, a3
; RV32IA-NEXT: mv a6, a5
-; RV32IA-NEXT: sll a7, a7, a3
-; RV32IA-NEXT: sra a7, a7, a3
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a1, a7, .LBB54_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB54_1 Depth=1
; RV32IA-NEXT: xor a6, a5, a1
-; RV32IA-NEXT: and a6, a6, a4
+; RV32IA-NEXT: and a6, a6, a3
; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB54_3: # in Loop: Header=BB54_1 Depth=1
; RV32IA-NEXT: sc.w.rl a6, a6, (a2)
@@ -7646,24 +7887,24 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_min_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: li a4, 255
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: li a3, 255
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 56
; RV64IA-NOZACAS-NEXT: srai a1, a1, 56
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT: xori a4, a0, 56
; RV64IA-NOZACAS-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1
; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a5, a4
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
; RV64IA-NOZACAS-NEXT: mv a6, a5
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB54_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB54_1 Depth=1
; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB54_3: # in Loop: Header=BB54_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -7675,24 +7916,24 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_min_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: li a4, 255
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: li a3, 255
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 56
; RV64IA-ZACAS-NEXT: srai a1, a1, 56
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: xori a3, a3, 56
+; RV64IA-ZACAS-NEXT: xori a4, a0, 56
; RV64IA-ZACAS-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1
; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a5, a4
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
; RV64IA-ZACAS-NEXT: mv a6, a5
-; RV64IA-ZACAS-NEXT: sll a7, a7, a3
-; RV64IA-ZACAS-NEXT: sra a7, a7, a3
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB54_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB54_1 Depth=1
; RV64IA-ZACAS-NEXT: xor a6, a5, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB54_3: # in Loop: Header=BB54_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
@@ -7758,6 +7999,7 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umax_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -7822,6 +8064,7 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umax_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -7846,6 +8089,7 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umax_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -7924,6 +8168,7 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umax_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -7948,6 +8193,7 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umax_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -8012,6 +8258,7 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umax_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8036,6 +8283,7 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umax_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8060,6 +8308,7 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umax_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -8084,6 +8333,7 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umax_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -8162,6 +8412,7 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umax_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -8186,6 +8437,7 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umax_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -8250,6 +8502,7 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umax_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8274,6 +8527,7 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umax_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8298,6 +8552,7 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umax_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -8322,6 +8577,7 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umax_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -8400,6 +8656,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umax_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -8424,6 +8681,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umax_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -8488,6 +8746,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umax_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8512,6 +8771,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umax_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8536,6 +8796,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umax_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -8560,6 +8821,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umax_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -8638,6 +8900,7 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umax_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -8702,6 +8965,7 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umax_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8726,6 +8990,7 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umax_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -8804,6 +9069,7 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umin_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -8868,6 +9134,7 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umin_i8_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -8892,6 +9159,7 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umin_i8_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -8970,6 +9238,7 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umin_i8_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -8994,6 +9263,7 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umin_i8_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -9058,6 +9328,7 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umin_i8_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9082,6 +9353,7 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umin_i8_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9106,6 +9378,7 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umin_i8_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -9130,6 +9403,7 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umin_i8_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -9208,6 +9482,7 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umin_i8_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -9232,6 +9507,7 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umin_i8_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -9296,6 +9572,7 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umin_i8_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9320,6 +9597,7 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umin_i8_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9344,6 +9622,7 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umin_i8_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -9368,6 +9647,7 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umin_i8_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -9446,6 +9726,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umin_i8_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: li a3, 255
; RV32IA-WMO-NEXT: sll a3, a3, a0
@@ -9470,6 +9751,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umin_i8_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: li a3, 255
; RV32IA-TSO-NEXT: sll a3, a3, a0
@@ -9534,6 +9816,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umin_i8_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: li a3, 255
; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9558,6 +9841,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umin_i8_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: li a3, 255
; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9582,6 +9866,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umin_i8_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: li a3, 255
; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
@@ -9606,6 +9891,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umin_i8_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: li a3, 255
; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
@@ -9684,6 +9970,7 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umin_i8_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -9748,6 +10035,7 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umin_i8_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: li a3, 255
; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
@@ -9772,6 +10060,7 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umin_i8_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: li a3, 255
; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
@@ -9820,6 +10109,7 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -9851,6 +10141,7 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -9872,6 +10163,7 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -9917,6 +10209,7 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -9938,6 +10231,7 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -9969,6 +10263,7 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -9990,6 +10285,7 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -10011,6 +10307,7 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -10032,6 +10329,7 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -10077,6 +10375,7 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -10098,6 +10397,7 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -10129,6 +10429,7 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -10150,6 +10451,7 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -10171,6 +10473,7 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -10192,6 +10495,7 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -10237,6 +10541,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -10258,6 +10563,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -10289,6 +10595,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -10310,6 +10617,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -10331,6 +10639,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -10352,6 +10661,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -10397,6 +10707,7 @@ define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -10428,6 +10739,7 @@ define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -10449,6 +10761,7 @@ define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -10498,6 +10811,7 @@ define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_0_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a1, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a2, 16
; RV32IA-NEXT: addi a2, a2, -1
@@ -10521,6 +10835,7 @@ define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_0_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a2, 16
; RV64IA-NOZACAS-NEXT: addi a2, a2, -1
@@ -10533,6 +10848,7 @@ define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_0_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a2, 16
; RV64IA-ZACAS-NEXT: addi a2, a2, -1
@@ -10570,6 +10886,7 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -10582,6 +10899,7 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -10605,6 +10923,7 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10617,6 +10936,7 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10629,6 +10949,7 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -10641,6 +10962,7 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -10678,6 +11000,7 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -10690,6 +11013,7 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -10713,6 +11037,7 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10725,6 +11050,7 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10737,6 +11063,7 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -10749,6 +11076,7 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -10786,6 +11114,7 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -10798,6 +11127,7 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -10821,6 +11151,7 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10833,6 +11164,7 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10845,6 +11177,7 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -10857,6 +11190,7 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -10894,6 +11228,7 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -10906,6 +11241,7 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -10929,6 +11265,7 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10941,6 +11278,7 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -10953,6 +11291,7 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -10965,6 +11304,7 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_0_i16_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -11003,6 +11343,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_monotonic(ptr %a) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_minus_1_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a1, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a2, 16
; RV32IA-NEXT: addi a2, a2, -1
@@ -11026,6 +11367,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_monotonic(ptr %a) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a2, 16
; RV64IA-NOZACAS-NEXT: addi a2, a2, -1
@@ -11037,6 +11379,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_monotonic(ptr %a) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a2, 16
; RV64IA-ZACAS-NEXT: addi a2, a2, -1
@@ -11076,6 +11419,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -11087,6 +11431,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -11110,6 +11455,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11121,6 +11467,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11132,6 +11479,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -11143,6 +11491,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -11182,6 +11531,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -11193,6 +11543,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -11216,6 +11567,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11227,6 +11579,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11238,6 +11591,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -11249,6 +11603,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -11288,6 +11643,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -11299,6 +11655,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -11322,6 +11679,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11333,6 +11691,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11344,6 +11703,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -11355,6 +11715,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -11394,6 +11755,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a1, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a2, 16
; RV32IA-WMO-NEXT: addi a2, a2, -1
@@ -11405,6 +11767,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a1, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a2, 16
; RV32IA-TSO-NEXT: addi a2, a2, -1
@@ -11428,6 +11791,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a2, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11439,6 +11803,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a2, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a2, a2, -1
@@ -11450,6 +11815,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a2, 16
; RV64IA-WMO-ZACAS-NEXT: addi a2, a2, -1
@@ -11461,6 +11827,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a1, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a2, 16
; RV64IA-TSO-ZACAS-NEXT: addi a2, a2, -1
@@ -11498,6 +11865,7 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_add_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -11529,6 +11897,7 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_add_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -11550,6 +11919,7 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_add_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -11595,6 +11965,7 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_add_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -11616,6 +11987,7 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_add_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -11647,6 +12019,7 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_add_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -11668,6 +12041,7 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_add_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -11689,6 +12063,7 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_add_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -11710,6 +12085,7 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_add_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -11755,6 +12131,7 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_add_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -11776,6 +12153,7 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_add_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -11807,6 +12185,7 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_add_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -11828,6 +12207,7 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_add_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -11849,6 +12229,7 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_add_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -11870,6 +12251,7 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_add_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -11915,6 +12297,7 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_add_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -11936,6 +12319,7 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_add_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -11967,6 +12351,7 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_add_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -11988,6 +12373,7 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_add_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12009,6 +12395,7 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_add_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -12030,6 +12417,7 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_add_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -12075,6 +12463,7 @@ define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_add_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -12106,6 +12495,7 @@ define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_add_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -12127,6 +12517,7 @@ define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_add_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -12172,6 +12563,7 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_sub_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -12203,6 +12595,7 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_sub_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -12224,6 +12617,7 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_sub_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -12271,6 +12665,7 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_sub_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -12292,6 +12687,7 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_sub_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -12323,6 +12719,7 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_sub_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12344,6 +12741,7 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_sub_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12365,6 +12763,7 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_sub_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -12386,6 +12785,7 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_sub_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -12433,6 +12833,7 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_sub_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -12454,6 +12855,7 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_sub_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -12485,6 +12887,7 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_sub_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12506,6 +12909,7 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_sub_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12527,6 +12931,7 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_sub_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -12548,6 +12953,7 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_sub_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -12595,6 +13001,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_sub_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -12616,6 +13023,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_sub_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -12647,6 +13055,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_sub_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12668,6 +13077,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_sub_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12689,6 +13099,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_sub_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -12710,6 +13121,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_sub_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -12757,6 +13169,7 @@ define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_sub_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -12788,6 +13201,7 @@ define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_sub_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -12809,6 +13223,7 @@ define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_sub_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -12856,6 +13271,7 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_and_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -12881,6 +13297,7 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_and_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -12896,6 +13313,7 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_and_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -12935,6 +13353,7 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -12950,6 +13369,7 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -12975,6 +13395,7 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -12990,6 +13411,7 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13005,6 +13427,7 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -13020,6 +13443,7 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -13059,6 +13483,7 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -13074,6 +13499,7 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -13099,6 +13525,7 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13114,6 +13541,7 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13129,6 +13557,7 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -13144,6 +13573,7 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -13183,6 +13613,7 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -13198,6 +13629,7 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -13223,6 +13655,7 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13238,6 +13671,7 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13253,6 +13687,7 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -13268,6 +13703,7 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -13307,6 +13743,7 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_and_i16_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -13322,6 +13759,7 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_and_i16_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -13347,6 +13785,7 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_and_i16_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13362,6 +13801,7 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_and_i16_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13377,6 +13817,7 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_and_i16_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -13392,6 +13833,7 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_and_i16_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -13431,6 +13873,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_nand_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -13463,6 +13906,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_nand_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -13485,6 +13929,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_nand_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -13507,6 +13952,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_monotonic:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -13529,6 +13975,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_monotonic:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -13595,6 +14042,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_nand_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -13617,6 +14065,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_nand_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -13649,6 +14098,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_nand_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13671,6 +14121,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_nand_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13693,6 +14144,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_nand_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -13715,6 +14167,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_nand_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -13737,6 +14190,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_acquire:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -13759,6 +14213,7 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_acquire:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -13825,6 +14280,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_nand_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -13847,6 +14303,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_nand_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -13879,6 +14336,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_nand_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13901,6 +14359,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_nand_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -13923,6 +14382,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_nand_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -13945,6 +14405,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_nand_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -13967,6 +14428,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_release:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -13989,6 +14451,7 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_release:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -14055,6 +14518,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_nand_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -14077,6 +14541,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_nand_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -14109,6 +14574,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_nand_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -14131,6 +14597,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_nand_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -14153,6 +14620,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_nand_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -14175,6 +14643,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_nand_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -14197,6 +14666,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_acq_rel:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -14219,6 +14689,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_acq_rel:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -14285,6 +14756,7 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_nand_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -14317,6 +14789,7 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_nand_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -14339,6 +14812,7 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_nand_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -14361,6 +14835,7 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_seq_cst:
; RV64IA-WMO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -14383,6 +14858,7 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZABHA-NOZACAS-LABEL: atomicrmw_nand_i16_seq_cst:
; RV64IA-TSO-ZABHA-NOZACAS: # %bb.0:
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZABHA-NOZACAS-NEXT: addi a3, a3, -1
@@ -14449,6 +14925,7 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_or_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srli a1, a1, 16
@@ -14470,6 +14947,7 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_or_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-NOZACAS-NEXT: srli a1, a1, 48
@@ -14481,6 +14959,7 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_or_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-ZACAS-NEXT: srli a1, a1, 48
@@ -14516,6 +14995,7 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -14527,6 +15007,7 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -14548,6 +15029,7 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14559,6 +15041,7 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14570,6 +15053,7 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -14581,6 +15065,7 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -14616,6 +15101,7 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -14627,6 +15113,7 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -14648,6 +15135,7 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14659,6 +15147,7 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14670,6 +15159,7 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -14681,6 +15171,7 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -14716,6 +15207,7 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -14727,6 +15219,7 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -14748,6 +15241,7 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14759,6 +15253,7 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14770,6 +15265,7 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -14781,6 +15277,7 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -14816,6 +15313,7 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_or_i16_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -14827,6 +15325,7 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_or_i16_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -14848,6 +15347,7 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_or_i16_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14859,6 +15359,7 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_or_i16_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -14870,6 +15371,7 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_or_i16_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -14881,6 +15383,7 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_or_i16_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -14916,6 +15419,7 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xor_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srli a1, a1, 16
@@ -14937,6 +15441,7 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_xor_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-NOZACAS-NEXT: srli a1, a1, 48
@@ -14948,6 +15453,7 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_xor_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-ZACAS-NEXT: srli a1, a1, 48
@@ -14983,6 +15489,7 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -14994,6 +15501,7 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -15015,6 +15523,7 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15026,6 +15535,7 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15037,6 +15547,7 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -15048,6 +15559,7 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -15083,6 +15595,7 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -15094,6 +15607,7 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -15115,6 +15629,7 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15126,6 +15641,7 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15137,6 +15653,7 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -15148,6 +15665,7 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -15183,6 +15701,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -15194,6 +15713,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -15215,6 +15735,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15226,6 +15747,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15237,6 +15759,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -15248,6 +15771,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -15283,6 +15807,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_xor_i16_seq_cst:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srli a1, a1, 16
@@ -15294,6 +15819,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_xor_i16_seq_cst:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srli a1, a1, 16
@@ -15315,6 +15841,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_xor_i16_seq_cst:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15326,6 +15853,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_xor_i16_seq_cst:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srli a1, a1, 48
@@ -15337,6 +15865,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_xor_i16_seq_cst:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srli a1, a1, 48
@@ -15348,6 +15877,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_xor_i16_seq_cst:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srli a1, a1, 48
@@ -15415,32 +15945,32 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_max_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: lui a4, 16
-; RV32IA-NEXT: addi a4, a4, -1
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: lui a3, 16
+; RV32IA-NEXT: addi a3, a3, -1
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srai a1, a1, 16
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: li a5, 16
-; RV32IA-NEXT: sub a5, a5, a3
+; RV32IA-NEXT: li a4, 16
+; RV32IA-NEXT: sub a4, a4, a0
; RV32IA-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT: lr.w a3, (a2)
-; RV32IA-NEXT: and a7, a3, a4
-; RV32IA-NEXT: mv a6, a3
-; RV32IA-NEXT: sll a7, a7, a5
-; RV32IA-NEXT: sra a7, a7, a5
+; RV32IA-NEXT: lr.w a5, (a2)
+; RV32IA-NEXT: and a7, a5, a3
+; RV32IA-NEXT: mv a6, a5
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a7, a1, .LBB110_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV32IA-NEXT: xor a6, a3, a1
-; RV32IA-NEXT: and a6, a6, a4
-; RV32IA-NEXT: xor a6, a3, a6
+; RV32IA-NEXT: xor a6, a5, a1
+; RV32IA-NEXT: and a6, a6, a3
+; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB110_3: # in Loop: Header=BB110_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
; RV32IA-NEXT: bnez a6, .LBB110_1
; RV32IA-NEXT: # %bb.4:
-; RV32IA-NEXT: srl a0, a3, a0
+; RV32IA-NEXT: srl a0, a5, a0
; RV32IA-NEXT: ret
;
; RV64I-LABEL: atomicrmw_max_i16_monotonic:
@@ -15488,63 +16018,63 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_max_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: lui a4, 16
-; RV64IA-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: lui a3, 16
+; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: li a5, 48
-; RV64IA-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT: li a4, 48
+; RV64IA-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-NOZACAS-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-NOZACAS-NEXT: mv a6, a3
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-NOZACAS-NEXT: mv a6, a5
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB110_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB110_3: # in Loop: Header=BB110_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-NOZACAS-NEXT: bnez a6, .LBB110_1
; RV64IA-NOZACAS-NEXT: # %bb.4:
-; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-NOZACAS-NEXT: ret
;
; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: lui a4, 16
-; RV64IA-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: lui a3, 16
+; RV64IA-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: li a5, 48
-; RV64IA-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-ZACAS-NEXT: li a4, 48
+; RV64IA-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-ZACAS-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-ZACAS-NEXT: mv a6, a3
-; RV64IA-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-ZACAS-NEXT: mv a6, a5
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB110_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV64IA-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB110_3: # in Loop: Header=BB110_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-ZACAS-NEXT: bnez a6, .LBB110_1
; RV64IA-ZACAS-NEXT: # %bb.4:
-; RV64IA-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_monotonic:
@@ -15606,63 +16136,63 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_max_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: lui a4, 16
-; RV32IA-WMO-NEXT: addi a4, a4, -1
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: lui a3, 16
+; RV32IA-WMO-NEXT: addi a3, a3, -1
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srai a1, a1, 16
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: li a5, 16
-; RV32IA-WMO-NEXT: sub a5, a5, a3
+; RV32IA-WMO-NEXT: li a4, 16
+; RV32IA-WMO-NEXT: sub a4, a4, a0
; RV32IA-WMO-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT: lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT: and a7, a3, a4
-; RV32IA-WMO-NEXT: mv a6, a3
-; RV32IA-WMO-NEXT: sll a7, a7, a5
-; RV32IA-WMO-NEXT: sra a7, a7, a5
+; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT: and a7, a5, a3
+; RV32IA-WMO-NEXT: mv a6, a5
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a7, a1, .LBB111_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV32IA-WMO-NEXT: xor a6, a3, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
-; RV32IA-WMO-NEXT: xor a6, a3, a6
+; RV32IA-WMO-NEXT: xor a6, a5, a1
+; RV32IA-WMO-NEXT: and a6, a6, a3
+; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1
; RV32IA-WMO-NEXT: sc.w a6, a6, (a2)
; RV32IA-WMO-NEXT: bnez a6, .LBB111_1
; RV32IA-WMO-NEXT: # %bb.4:
-; RV32IA-WMO-NEXT: srl a0, a3, a0
+; RV32IA-WMO-NEXT: srl a0, a5, a0
; RV32IA-WMO-NEXT: ret
;
; RV32IA-TSO-LABEL: atomicrmw_max_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: lui a4, 16
-; RV32IA-TSO-NEXT: addi a4, a4, -1
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: lui a3, 16
+; RV32IA-TSO-NEXT: addi a3, a3, -1
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srai a1, a1, 16
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: li a5, 16
-; RV32IA-TSO-NEXT: sub a5, a5, a3
+; RV32IA-TSO-NEXT: li a4, 16
+; RV32IA-TSO-NEXT: sub a4, a4, a0
; RV32IA-TSO-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT: lr.w a3, (a2)
-; RV32IA-TSO-NEXT: and a7, a3, a4
-; RV32IA-TSO-NEXT: mv a6, a3
-; RV32IA-TSO-NEXT: sll a7, a7, a5
-; RV32IA-TSO-NEXT: sra a7, a7, a5
+; RV32IA-TSO-NEXT: lr.w a5, (a2)
+; RV32IA-TSO-NEXT: and a7, a5, a3
+; RV32IA-TSO-NEXT: mv a6, a5
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a7, a1, .LBB111_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV32IA-TSO-NEXT: xor a6, a3, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
-; RV32IA-TSO-NEXT: xor a6, a3, a6
+; RV32IA-TSO-NEXT: xor a6, a5, a1
+; RV32IA-TSO-NEXT: and a6, a6, a3
+; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
; RV32IA-TSO-NEXT: bnez a6, .LBB111_1
; RV32IA-TSO-NEXT: # %bb.4:
-; RV32IA-TSO-NEXT: srl a0, a3, a0
+; RV32IA-TSO-NEXT: srl a0, a5, a0
; RV32IA-TSO-NEXT: ret
;
; RV64I-LABEL: atomicrmw_max_i16_acquire:
@@ -15710,125 +16240,125 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_max_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-NOZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB111_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB111_1
; RV64IA-WMO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-NOZACAS-NEXT: ret
;
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-NOZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB111_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB111_1
; RV64IA-TSO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-NOZACAS-NEXT: ret
;
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: li a5, 48
-; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: li a4, 48
+; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-ZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB111_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB111_1
; RV64IA-WMO-ZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-ZACAS-NEXT: ret
;
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: li a5, 48
-; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: li a4, 48
+; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-ZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB111_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB111_1
; RV64IA-TSO-ZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acquire:
@@ -15890,63 +16420,63 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_max_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: lui a4, 16
-; RV32IA-WMO-NEXT: addi a4, a4, -1
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: lui a3, 16
+; RV32IA-WMO-NEXT: addi a3, a3, -1
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srai a1, a1, 16
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: li a5, 16
-; RV32IA-WMO-NEXT: sub a5, a5, a3
+; RV32IA-WMO-NEXT: li a4, 16
+; RV32IA-WMO-NEXT: sub a4, a4, a0
; RV32IA-WMO-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT: lr.w a3, (a2)
-; RV32IA-WMO-NEXT: and a7, a3, a4
-; RV32IA-WMO-NEXT: mv a6, a3
-; RV32IA-WMO-NEXT: sll a7, a7, a5
-; RV32IA-WMO-NEXT: sra a7, a7, a5
+; RV32IA-WMO-NEXT: lr.w a5, (a2)
+; RV32IA-WMO-NEXT: and a7, a5, a3
+; RV32IA-WMO-NEXT: mv a6, a5
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a7, a1, .LBB112_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV32IA-WMO-NEXT: xor a6, a3, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
-; RV32IA-WMO-NEXT: xor a6, a3, a6
+; RV32IA-WMO-NEXT: xor a6, a5, a1
+; RV32IA-WMO-NEXT: and a6, a6, a3
+; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
; RV32IA-WMO-NEXT: bnez a6, .LBB112_1
; RV32IA-WMO-NEXT: # %bb.4:
-; RV32IA-WMO-NEXT: srl a0, a3, a0
+; RV32IA-WMO-NEXT: srl a0, a5, a0
; RV32IA-WMO-NEXT: ret
;
; RV32IA-TSO-LABEL: atomicrmw_max_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: lui a4, 16
-; RV32IA-TSO-NEXT: addi a4, a4, -1
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: lui a3, 16
+; RV32IA-TSO-NEXT: addi a3, a3, -1
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srai a1, a1, 16
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: li a5, 16
-; RV32IA-TSO-NEXT: sub a5, a5, a3
+; RV32IA-TSO-NEXT: li a4, 16
+; RV32IA-TSO-NEXT: sub a4, a4, a0
; RV32IA-TSO-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT: lr.w a3, (a2)
-; RV32IA-TSO-NEXT: and a7, a3, a4
-; RV32IA-TSO-NEXT: mv a6, a3
-; RV32IA-TSO-NEXT: sll a7, a7, a5
-; RV32IA-TSO-NEXT: sra a7, a7, a5
+; RV32IA-TSO-NEXT: lr.w a5, (a2)
+; RV32IA-TSO-NEXT: and a7, a5, a3
+; RV32IA-TSO-NEXT: mv a6, a5
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a7, a1, .LBB112_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV32IA-TSO-NEXT: xor a6, a3, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
-; RV32IA-TSO-NEXT: xor a6, a3, a6
+; RV32IA-TSO-NEXT: xor a6, a5, a1
+; RV32IA-TSO-NEXT: and a6, a6, a3
+; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
; RV32IA-TSO-NEXT: bnez a6, .LBB112_1
; RV32IA-TSO-NEXT: # %bb.4:
-; RV32IA-TSO-NEXT: srl a0, a3, a0
+; RV32IA-TSO-NEXT: srl a0, a5, a0
; RV32IA-TSO-NEXT: ret
;
; RV64I-LABEL: atomicrmw_max_i16_release:
@@ -15994,125 +16524,125 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_max_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-NOZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB112_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB112_1
; RV64IA-WMO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-NOZACAS-NEXT: ret
;
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-NOZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB112_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB112_1
; RV64IA-TSO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-NOZACAS-NEXT: ret
;
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: li a5, 48
-; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: li a4, 48
+; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-ZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB112_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB112_1
; RV64IA-WMO-ZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-ZACAS-NEXT: ret
;
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: li a5, 48
-; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: li a4, 48
+; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-ZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB112_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB112_1
; RV64IA-TSO-ZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_release:
@@ -16174,63 +16704,63 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_max_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: lui a4, 16
-; RV32IA-WMO-NEXT: addi a4, a4, -1
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: lui a3, 16
+; RV32IA-WMO-NEXT: addi a3, a3, -1
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srai a1, a1, 16
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: li a5, 16
-; RV32IA-WMO-NEXT: sub a5, a5, a3
+; RV32IA-WMO-NEXT: li a4, 16
+; RV32IA-WMO-NEXT: sub a4, a4, a0
; RV32IA-WMO-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT: lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT: and a7, a3, a4
-; RV32IA-WMO-NEXT: mv a6, a3
-; RV32IA-WMO-NEXT: sll a7, a7, a5
-; RV32IA-WMO-NEXT: sra a7, a7, a5
+; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT: and a7, a5, a3
+; RV32IA-WMO-NEXT: mv a6, a5
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a7, a1, .LBB113_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV32IA-WMO-NEXT: xor a6, a3, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
-; RV32IA-WMO-NEXT: xor a6, a3, a6
+; RV32IA-WMO-NEXT: xor a6, a5, a1
+; RV32IA-WMO-NEXT: and a6, a6, a3
+; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
; RV32IA-WMO-NEXT: bnez a6, .LBB113_1
; RV32IA-WMO-NEXT: # %bb.4:
-; RV32IA-WMO-NEXT: srl a0, a3, a0
+; RV32IA-WMO-NEXT: srl a0, a5, a0
; RV32IA-WMO-NEXT: ret
;
; RV32IA-TSO-LABEL: atomicrmw_max_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: lui a4, 16
-; RV32IA-TSO-NEXT: addi a4, a4, -1
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: lui a3, 16
+; RV32IA-TSO-NEXT: addi a3, a3, -1
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srai a1, a1, 16
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: li a5, 16
-; RV32IA-TSO-NEXT: sub a5, a5, a3
+; RV32IA-TSO-NEXT: li a4, 16
+; RV32IA-TSO-NEXT: sub a4, a4, a0
; RV32IA-TSO-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT: lr.w a3, (a2)
-; RV32IA-TSO-NEXT: and a7, a3, a4
-; RV32IA-TSO-NEXT: mv a6, a3
-; RV32IA-TSO-NEXT: sll a7, a7, a5
-; RV32IA-TSO-NEXT: sra a7, a7, a5
+; RV32IA-TSO-NEXT: lr.w a5, (a2)
+; RV32IA-TSO-NEXT: and a7, a5, a3
+; RV32IA-TSO-NEXT: mv a6, a5
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a7, a1, .LBB113_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV32IA-TSO-NEXT: xor a6, a3, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
-; RV32IA-TSO-NEXT: xor a6, a3, a6
+; RV32IA-TSO-NEXT: xor a6, a5, a1
+; RV32IA-TSO-NEXT: and a6, a6, a3
+; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
; RV32IA-TSO-NEXT: bnez a6, .LBB113_1
; RV32IA-TSO-NEXT: # %bb.4:
-; RV32IA-TSO-NEXT: srl a0, a3, a0
+; RV32IA-TSO-NEXT: srl a0, a5, a0
; RV32IA-TSO-NEXT: ret
;
; RV64I-LABEL: atomicrmw_max_i16_acq_rel:
@@ -16278,125 +16808,125 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_max_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-NOZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB113_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB113_1
; RV64IA-WMO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-NOZACAS-NEXT: ret
;
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-NOZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB113_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB113_1
; RV64IA-TSO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-NOZACAS-NEXT: ret
;
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: li a5, 48
-; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: li a4, 48
+; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-ZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB113_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB113_1
; RV64IA-WMO-ZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-ZACAS-NEXT: ret
;
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: li a5, 48
-; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: li a4, 48
+; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-ZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB113_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB113_1
; RV64IA-TSO-ZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acq_rel:
@@ -16458,32 +16988,32 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_max_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: lui a4, 16
-; RV32IA-NEXT: addi a4, a4, -1
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: lui a3, 16
+; RV32IA-NEXT: addi a3, a3, -1
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srai a1, a1, 16
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: li a5, 16
-; RV32IA-NEXT: sub a5, a5, a3
+; RV32IA-NEXT: li a4, 16
+; RV32IA-NEXT: sub a4, a4, a0
; RV32IA-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT: lr.w.aqrl a3, (a2)
-; RV32IA-NEXT: and a7, a3, a4
-; RV32IA-NEXT: mv a6, a3
-; RV32IA-NEXT: sll a7, a7, a5
-; RV32IA-NEXT: sra a7, a7, a5
+; RV32IA-NEXT: lr.w.aqrl a5, (a2)
+; RV32IA-NEXT: and a7, a5, a3
+; RV32IA-NEXT: mv a6, a5
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a7, a1, .LBB114_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV32IA-NEXT: xor a6, a3, a1
-; RV32IA-NEXT: and a6, a6, a4
-; RV32IA-NEXT: xor a6, a3, a6
+; RV32IA-NEXT: xor a6, a5, a1
+; RV32IA-NEXT: and a6, a6, a3
+; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB114_3: # in Loop: Header=BB114_1 Depth=1
; RV32IA-NEXT: sc.w.rl a6, a6, (a2)
; RV32IA-NEXT: bnez a6, .LBB114_1
; RV32IA-NEXT: # %bb.4:
-; RV32IA-NEXT: srl a0, a3, a0
+; RV32IA-NEXT: srl a0, a5, a0
; RV32IA-NEXT: ret
;
; RV64I-LABEL: atomicrmw_max_i16_seq_cst:
@@ -16531,63 +17061,63 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_max_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: lui a4, 16
-; RV64IA-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: lui a3, 16
+; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: li a5, 48
-; RV64IA-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT: li a4, 48
+; RV64IA-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-NOZACAS-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT: lr.w.aqrl a3, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-NOZACAS-NEXT: mv a6, a3
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2)
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-NOZACAS-NEXT: mv a6, a5
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB114_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB114_3: # in Loop: Header=BB114_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-NOZACAS-NEXT: bnez a6, .LBB114_1
; RV64IA-NOZACAS-NEXT: # %bb.4:
-; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-NOZACAS-NEXT: ret
;
; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: lui a4, 16
-; RV64IA-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: lui a3, 16
+; RV64IA-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: li a5, 48
-; RV64IA-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-ZACAS-NEXT: li a4, 48
+; RV64IA-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-ZACAS-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT: lr.w.aqrl a3, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-ZACAS-NEXT: mv a6, a3
-; RV64IA-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2)
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-ZACAS-NEXT: mv a6, a5
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB114_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV64IA-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB114_3: # in Loop: Header=BB114_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-ZACAS-NEXT: bnez a6, .LBB114_1
; RV64IA-ZACAS-NEXT: # %bb.4:
-; RV64IA-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_seq_cst:
@@ -16649,32 +17179,32 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_min_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: lui a4, 16
-; RV32IA-NEXT: addi a4, a4, -1
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: lui a3, 16
+; RV32IA-NEXT: addi a3, a3, -1
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srai a1, a1, 16
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: li a5, 16
-; RV32IA-NEXT: sub a5, a5, a3
+; RV32IA-NEXT: li a4, 16
+; RV32IA-NEXT: sub a4, a4, a0
; RV32IA-NEXT: .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT: lr.w a3, (a2)
-; RV32IA-NEXT: and a7, a3, a4
-; RV32IA-NEXT: mv a6, a3
-; RV32IA-NEXT: sll a7, a7, a5
-; RV32IA-NEXT: sra a7, a7, a5
+; RV32IA-NEXT: lr.w a5, (a2)
+; RV32IA-NEXT: and a7, a5, a3
+; RV32IA-NEXT: mv a6, a5
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a1, a7, .LBB115_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV32IA-NEXT: xor a6, a3, a1
-; RV32IA-NEXT: and a6, a6, a4
-; RV32IA-NEXT: xor a6, a3, a6
+; RV32IA-NEXT: xor a6, a5, a1
+; RV32IA-NEXT: and a6, a6, a3
+; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB115_3: # in Loop: Header=BB115_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
; RV32IA-NEXT: bnez a6, .LBB115_1
; RV32IA-NEXT: # %bb.4:
-; RV32IA-NEXT: srl a0, a3, a0
+; RV32IA-NEXT: srl a0, a5, a0
; RV32IA-NEXT: ret
;
; RV64I-LABEL: atomicrmw_min_i16_monotonic:
@@ -16722,63 +17252,63 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_min_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: lui a4, 16
-; RV64IA-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: lui a3, 16
+; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: li a5, 48
-; RV64IA-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT: li a4, 48
+; RV64IA-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-NOZACAS-NEXT: .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-NOZACAS-NEXT: mv a6, a3
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-NOZACAS-NEXT: mv a6, a5
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB115_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB115_3: # in Loop: Header=BB115_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-NOZACAS-NEXT: bnez a6, .LBB115_1
; RV64IA-NOZACAS-NEXT: # %bb.4:
-; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-NOZACAS-NEXT: ret
;
; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: lui a4, 16
-; RV64IA-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: lui a3, 16
+; RV64IA-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: li a5, 48
-; RV64IA-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-ZACAS-NEXT: li a4, 48
+; RV64IA-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-ZACAS-NEXT: .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-ZACAS-NEXT: mv a6, a3
-; RV64IA-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-ZACAS-NEXT: mv a6, a5
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB115_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV64IA-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB115_3: # in Loop: Header=BB115_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-ZACAS-NEXT: bnez a6, .LBB115_1
; RV64IA-ZACAS-NEXT: # %bb.4:
-; RV64IA-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_monotonic:
@@ -16840,63 +17370,63 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_min_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: lui a4, 16
-; RV32IA-WMO-NEXT: addi a4, a4, -1
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: lui a3, 16
+; RV32IA-WMO-NEXT: addi a3, a3, -1
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srai a1, a1, 16
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: li a5, 16
-; RV32IA-WMO-NEXT: sub a5, a5, a3
+; RV32IA-WMO-NEXT: li a4, 16
+; RV32IA-WMO-NEXT: sub a4, a4, a0
; RV32IA-WMO-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT: lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT: and a7, a3, a4
-; RV32IA-WMO-NEXT: mv a6, a3
-; RV32IA-WMO-NEXT: sll a7, a7, a5
-; RV32IA-WMO-NEXT: sra a7, a7, a5
+; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT: and a7, a5, a3
+; RV32IA-WMO-NEXT: mv a6, a5
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a1, a7, .LBB116_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV32IA-WMO-NEXT: xor a6, a3, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
-; RV32IA-WMO-NEXT: xor a6, a3, a6
+; RV32IA-WMO-NEXT: xor a6, a5, a1
+; RV32IA-WMO-NEXT: and a6, a6, a3
+; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1
; RV32IA-WMO-NEXT: sc.w a6, a6, (a2)
; RV32IA-WMO-NEXT: bnez a6, .LBB116_1
; RV32IA-WMO-NEXT: # %bb.4:
-; RV32IA-WMO-NEXT: srl a0, a3, a0
+; RV32IA-WMO-NEXT: srl a0, a5, a0
; RV32IA-WMO-NEXT: ret
;
; RV32IA-TSO-LABEL: atomicrmw_min_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: lui a4, 16
-; RV32IA-TSO-NEXT: addi a4, a4, -1
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: lui a3, 16
+; RV32IA-TSO-NEXT: addi a3, a3, -1
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srai a1, a1, 16
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: li a5, 16
-; RV32IA-TSO-NEXT: sub a5, a5, a3
+; RV32IA-TSO-NEXT: li a4, 16
+; RV32IA-TSO-NEXT: sub a4, a4, a0
; RV32IA-TSO-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT: lr.w a3, (a2)
-; RV32IA-TSO-NEXT: and a7, a3, a4
-; RV32IA-TSO-NEXT: mv a6, a3
-; RV32IA-TSO-NEXT: sll a7, a7, a5
-; RV32IA-TSO-NEXT: sra a7, a7, a5
+; RV32IA-TSO-NEXT: lr.w a5, (a2)
+; RV32IA-TSO-NEXT: and a7, a5, a3
+; RV32IA-TSO-NEXT: mv a6, a5
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a1, a7, .LBB116_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV32IA-TSO-NEXT: xor a6, a3, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
-; RV32IA-TSO-NEXT: xor a6, a3, a6
+; RV32IA-TSO-NEXT: xor a6, a5, a1
+; RV32IA-TSO-NEXT: and a6, a6, a3
+; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
; RV32IA-TSO-NEXT: bnez a6, .LBB116_1
; RV32IA-TSO-NEXT: # %bb.4:
-; RV32IA-TSO-NEXT: srl a0, a3, a0
+; RV32IA-TSO-NEXT: srl a0, a5, a0
; RV32IA-TSO-NEXT: ret
;
; RV64I-LABEL: atomicrmw_min_i16_acquire:
@@ -16944,125 +17474,125 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_min_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-NOZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB116_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB116_1
; RV64IA-WMO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-NOZACAS-NEXT: ret
;
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-NOZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB116_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB116_1
; RV64IA-TSO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-NOZACAS-NEXT: ret
;
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: li a5, 48
-; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: li a4, 48
+; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-ZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB116_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB116_1
; RV64IA-WMO-ZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-ZACAS-NEXT: ret
;
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: li a5, 48
-; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: li a4, 48
+; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-ZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB116_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB116_1
; RV64IA-TSO-ZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acquire:
@@ -17124,63 +17654,63 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_min_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: lui a4, 16
-; RV32IA-WMO-NEXT: addi a4, a4, -1
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: lui a3, 16
+; RV32IA-WMO-NEXT: addi a3, a3, -1
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srai a1, a1, 16
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: li a5, 16
-; RV32IA-WMO-NEXT: sub a5, a5, a3
+; RV32IA-WMO-NEXT: li a4, 16
+; RV32IA-WMO-NEXT: sub a4, a4, a0
; RV32IA-WMO-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT: lr.w a3, (a2)
-; RV32IA-WMO-NEXT: and a7, a3, a4
-; RV32IA-WMO-NEXT: mv a6, a3
-; RV32IA-WMO-NEXT: sll a7, a7, a5
-; RV32IA-WMO-NEXT: sra a7, a7, a5
+; RV32IA-WMO-NEXT: lr.w a5, (a2)
+; RV32IA-WMO-NEXT: and a7, a5, a3
+; RV32IA-WMO-NEXT: mv a6, a5
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a1, a7, .LBB117_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV32IA-WMO-NEXT: xor a6, a3, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
-; RV32IA-WMO-NEXT: xor a6, a3, a6
+; RV32IA-WMO-NEXT: xor a6, a5, a1
+; RV32IA-WMO-NEXT: and a6, a6, a3
+; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
; RV32IA-WMO-NEXT: bnez a6, .LBB117_1
; RV32IA-WMO-NEXT: # %bb.4:
-; RV32IA-WMO-NEXT: srl a0, a3, a0
+; RV32IA-WMO-NEXT: srl a0, a5, a0
; RV32IA-WMO-NEXT: ret
;
; RV32IA-TSO-LABEL: atomicrmw_min_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: lui a4, 16
-; RV32IA-TSO-NEXT: addi a4, a4, -1
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: lui a3, 16
+; RV32IA-TSO-NEXT: addi a3, a3, -1
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srai a1, a1, 16
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: li a5, 16
-; RV32IA-TSO-NEXT: sub a5, a5, a3
+; RV32IA-TSO-NEXT: li a4, 16
+; RV32IA-TSO-NEXT: sub a4, a4, a0
; RV32IA-TSO-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT: lr.w a3, (a2)
-; RV32IA-TSO-NEXT: and a7, a3, a4
-; RV32IA-TSO-NEXT: mv a6, a3
-; RV32IA-TSO-NEXT: sll a7, a7, a5
-; RV32IA-TSO-NEXT: sra a7, a7, a5
+; RV32IA-TSO-NEXT: lr.w a5, (a2)
+; RV32IA-TSO-NEXT: and a7, a5, a3
+; RV32IA-TSO-NEXT: mv a6, a5
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a1, a7, .LBB117_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV32IA-TSO-NEXT: xor a6, a3, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
-; RV32IA-TSO-NEXT: xor a6, a3, a6
+; RV32IA-TSO-NEXT: xor a6, a5, a1
+; RV32IA-TSO-NEXT: and a6, a6, a3
+; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
; RV32IA-TSO-NEXT: bnez a6, .LBB117_1
; RV32IA-TSO-NEXT: # %bb.4:
-; RV32IA-TSO-NEXT: srl a0, a3, a0
+; RV32IA-TSO-NEXT: srl a0, a5, a0
; RV32IA-TSO-NEXT: ret
;
; RV64I-LABEL: atomicrmw_min_i16_release:
@@ -17228,125 +17758,125 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_min_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-NOZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB117_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB117_1
; RV64IA-WMO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-NOZACAS-NEXT: ret
;
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-NOZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB117_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB117_1
; RV64IA-TSO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-NOZACAS-NEXT: ret
;
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: li a5, 48
-; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: li a4, 48
+; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-ZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB117_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB117_1
; RV64IA-WMO-ZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-ZACAS-NEXT: ret
;
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: li a5, 48
-; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: li a4, 48
+; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-ZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB117_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB117_1
; RV64IA-TSO-ZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_release:
@@ -17408,63 +17938,63 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_min_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
-; RV32IA-WMO-NEXT: andi a3, a0, 24
-; RV32IA-WMO-NEXT: lui a4, 16
-; RV32IA-WMO-NEXT: addi a4, a4, -1
-; RV32IA-WMO-NEXT: sll a4, a4, a0
+; RV32IA-WMO-NEXT: lui a3, 16
+; RV32IA-WMO-NEXT: addi a3, a3, -1
+; RV32IA-WMO-NEXT: sll a3, a3, a0
; RV32IA-WMO-NEXT: slli a1, a1, 16
; RV32IA-WMO-NEXT: srai a1, a1, 16
; RV32IA-WMO-NEXT: sll a1, a1, a0
-; RV32IA-WMO-NEXT: li a5, 16
-; RV32IA-WMO-NEXT: sub a5, a5, a3
+; RV32IA-WMO-NEXT: li a4, 16
+; RV32IA-WMO-NEXT: sub a4, a4, a0
; RV32IA-WMO-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT: lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT: and a7, a3, a4
-; RV32IA-WMO-NEXT: mv a6, a3
-; RV32IA-WMO-NEXT: sll a7, a7, a5
-; RV32IA-WMO-NEXT: sra a7, a7, a5
+; RV32IA-WMO-NEXT: lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT: and a7, a5, a3
+; RV32IA-WMO-NEXT: mv a6, a5
+; RV32IA-WMO-NEXT: sll a7, a7, a4
+; RV32IA-WMO-NEXT: sra a7, a7, a4
; RV32IA-WMO-NEXT: bge a1, a7, .LBB118_3
; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV32IA-WMO-NEXT: xor a6, a3, a1
-; RV32IA-WMO-NEXT: and a6, a6, a4
-; RV32IA-WMO-NEXT: xor a6, a3, a6
+; RV32IA-WMO-NEXT: xor a6, a5, a1
+; RV32IA-WMO-NEXT: and a6, a6, a3
+; RV32IA-WMO-NEXT: xor a6, a5, a6
; RV32IA-WMO-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1
; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2)
; RV32IA-WMO-NEXT: bnez a6, .LBB118_1
; RV32IA-WMO-NEXT: # %bb.4:
-; RV32IA-WMO-NEXT: srl a0, a3, a0
+; RV32IA-WMO-NEXT: srl a0, a5, a0
; RV32IA-WMO-NEXT: ret
;
; RV32IA-TSO-LABEL: atomicrmw_min_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
-; RV32IA-TSO-NEXT: andi a3, a0, 24
-; RV32IA-TSO-NEXT: lui a4, 16
-; RV32IA-TSO-NEXT: addi a4, a4, -1
-; RV32IA-TSO-NEXT: sll a4, a4, a0
+; RV32IA-TSO-NEXT: lui a3, 16
+; RV32IA-TSO-NEXT: addi a3, a3, -1
+; RV32IA-TSO-NEXT: sll a3, a3, a0
; RV32IA-TSO-NEXT: slli a1, a1, 16
; RV32IA-TSO-NEXT: srai a1, a1, 16
; RV32IA-TSO-NEXT: sll a1, a1, a0
-; RV32IA-TSO-NEXT: li a5, 16
-; RV32IA-TSO-NEXT: sub a5, a5, a3
+; RV32IA-TSO-NEXT: li a4, 16
+; RV32IA-TSO-NEXT: sub a4, a4, a0
; RV32IA-TSO-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT: lr.w a3, (a2)
-; RV32IA-TSO-NEXT: and a7, a3, a4
-; RV32IA-TSO-NEXT: mv a6, a3
-; RV32IA-TSO-NEXT: sll a7, a7, a5
-; RV32IA-TSO-NEXT: sra a7, a7, a5
+; RV32IA-TSO-NEXT: lr.w a5, (a2)
+; RV32IA-TSO-NEXT: and a7, a5, a3
+; RV32IA-TSO-NEXT: mv a6, a5
+; RV32IA-TSO-NEXT: sll a7, a7, a4
+; RV32IA-TSO-NEXT: sra a7, a7, a4
; RV32IA-TSO-NEXT: bge a1, a7, .LBB118_3
; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV32IA-TSO-NEXT: xor a6, a3, a1
-; RV32IA-TSO-NEXT: and a6, a6, a4
-; RV32IA-TSO-NEXT: xor a6, a3, a6
+; RV32IA-TSO-NEXT: xor a6, a5, a1
+; RV32IA-TSO-NEXT: and a6, a6, a3
+; RV32IA-TSO-NEXT: xor a6, a5, a6
; RV32IA-TSO-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1
; RV32IA-TSO-NEXT: sc.w a6, a6, (a2)
; RV32IA-TSO-NEXT: bnez a6, .LBB118_1
; RV32IA-TSO-NEXT: # %bb.4:
-; RV32IA-TSO-NEXT: srl a0, a3, a0
+; RV32IA-TSO-NEXT: srl a0, a5, a0
; RV32IA-TSO-NEXT: ret
;
; RV64I-LABEL: atomicrmw_min_i16_acq_rel:
@@ -17512,125 +18042,125 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_min_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT: li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-NOZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB118_3
; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-NOZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1
; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB118_1
; RV64IA-WMO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-NOZACAS-NEXT: ret
;
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT: li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-NOZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB118_3
; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-NOZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1
; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB118_1
; RV64IA-TSO-NOZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-NOZACAS-NEXT: ret
;
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT: lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
+; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT: li a5, 48
-; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: li a4, 48
+; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-WMO-ZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT: mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT: mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB118_3
; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-WMO-ZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1
; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB118_1
; RV64IA-WMO-ZACAS-NEXT: # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-WMO-ZACAS-NEXT: ret
;
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT: lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
+; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT: li a5, 48
-; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: li a4, 48
+; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-TSO-ZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT: mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT: mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB118_3
; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-TSO-ZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1
; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2)
; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB118_1
; RV64IA-TSO-ZACAS-NEXT: # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-TSO-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acq_rel:
@@ -17692,32 +18222,32 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_min_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: lui a4, 16
-; RV32IA-NEXT: addi a4, a4, -1
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: lui a3, 16
+; RV32IA-NEXT: addi a3, a3, -1
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srai a1, a1, 16
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: li a5, 16
-; RV32IA-NEXT: sub a5, a5, a3
+; RV32IA-NEXT: li a4, 16
+; RV32IA-NEXT: sub a4, a4, a0
; RV32IA-NEXT: .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT: lr.w.aqrl a3, (a2)
-; RV32IA-NEXT: and a7, a3, a4
-; RV32IA-NEXT: mv a6, a3
-; RV32IA-NEXT: sll a7, a7, a5
-; RV32IA-NEXT: sra a7, a7, a5
+; RV32IA-NEXT: lr.w.aqrl a5, (a2)
+; RV32IA-NEXT: and a7, a5, a3
+; RV32IA-NEXT: mv a6, a5
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a1, a7, .LBB119_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV32IA-NEXT: xor a6, a3, a1
-; RV32IA-NEXT: and a6, a6, a4
-; RV32IA-NEXT: xor a6, a3, a6
+; RV32IA-NEXT: xor a6, a5, a1
+; RV32IA-NEXT: and a6, a6, a3
+; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB119_3: # in Loop: Header=BB119_1 Depth=1
; RV32IA-NEXT: sc.w.rl a6, a6, (a2)
; RV32IA-NEXT: bnez a6, .LBB119_1
; RV32IA-NEXT: # %bb.4:
-; RV32IA-NEXT: srl a0, a3, a0
+; RV32IA-NEXT: srl a0, a5, a0
; RV32IA-NEXT: ret
;
; RV64I-LABEL: atomicrmw_min_i16_seq_cst:
@@ -17765,63 +18295,63 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_min_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT: andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT: lui a4, 16
-; RV64IA-NOZACAS-NEXT: addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT: lui a3, 16
+; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
+; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0
; RV64IA-NOZACAS-NEXT: slli a1, a1, 48
; RV64IA-NOZACAS-NEXT: srai a1, a1, 48
; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT: li a5, 48
-; RV64IA-NOZACAS-NEXT: sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT: li a4, 48
+; RV64IA-NOZACAS-NEXT: sub a4, a4, a0
; RV64IA-NOZACAS-NEXT: .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT: lr.w.aqrl a3, (a2)
-; RV64IA-NOZACAS-NEXT: and a7, a3, a4
-; RV64IA-NOZACAS-NEXT: mv a6, a3
-; RV64IA-NOZACAS-NEXT: sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT: sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2)
+; RV64IA-NOZACAS-NEXT: and a7, a5, a3
+; RV64IA-NOZACAS-NEXT: mv a6, a5
+; RV64IA-NOZACAS-NEXT: sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT: sra a7, a7, a4
; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB119_3
; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT: and a6, a6, a4
-; RV64IA-NOZACAS-NEXT: xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT: and a6, a6, a3
+; RV64IA-NOZACAS-NEXT: xor a6, a5, a6
; RV64IA-NOZACAS-NEXT: .LBB119_3: # in Loop: Header=BB119_1 Depth=1
; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-NOZACAS-NEXT: bnez a6, .LBB119_1
; RV64IA-NOZACAS-NEXT: # %bb.4:
-; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0
; RV64IA-NOZACAS-NEXT: ret
;
; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
-; RV64IA-ZACAS-NEXT: andi a3, a0, 24
-; RV64IA-ZACAS-NEXT: lui a4, 16
-; RV64IA-ZACAS-NEXT: addi a4, a4, -1
-; RV64IA-ZACAS-NEXT: sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT: lui a3, 16
+; RV64IA-ZACAS-NEXT: addi a3, a3, -1
+; RV64IA-ZACAS-NEXT: sllw a3, a3, a0
; RV64IA-ZACAS-NEXT: slli a1, a1, 48
; RV64IA-ZACAS-NEXT: srai a1, a1, 48
; RV64IA-ZACAS-NEXT: sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT: li a5, 48
-; RV64IA-ZACAS-NEXT: sub a5, a5, a3
+; RV64IA-ZACAS-NEXT: li a4, 48
+; RV64IA-ZACAS-NEXT: sub a4, a4, a0
; RV64IA-ZACAS-NEXT: .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT: lr.w.aqrl a3, (a2)
-; RV64IA-ZACAS-NEXT: and a7, a3, a4
-; RV64IA-ZACAS-NEXT: mv a6, a3
-; RV64IA-ZACAS-NEXT: sll a7, a7, a5
-; RV64IA-ZACAS-NEXT: sra a7, a7, a5
+; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2)
+; RV64IA-ZACAS-NEXT: and a7, a5, a3
+; RV64IA-ZACAS-NEXT: mv a6, a5
+; RV64IA-ZACAS-NEXT: sll a7, a7, a4
+; RV64IA-ZACAS-NEXT: sra a7, a7, a4
; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB119_3
; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV64IA-ZACAS-NEXT: xor a6, a3, a1
-; RV64IA-ZACAS-NEXT: and a6, a6, a4
-; RV64IA-ZACAS-NEXT: xor a6, a3, a6
+; RV64IA-ZACAS-NEXT: xor a6, a5, a1
+; RV64IA-ZACAS-NEXT: and a6, a6, a3
+; RV64IA-ZACAS-NEXT: xor a6, a5, a6
; RV64IA-ZACAS-NEXT: .LBB119_3: # in Loop: Header=BB119_1 Depth=1
; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2)
; RV64IA-ZACAS-NEXT: bnez a6, .LBB119_1
; RV64IA-ZACAS-NEXT: # %bb.4:
-; RV64IA-ZACAS-NEXT: srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT: srlw a0, a5, a0
; RV64IA-ZACAS-NEXT: ret
;
; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_seq_cst:
@@ -17885,6 +18415,7 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umax_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -17954,6 +18485,7 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umax_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -17979,6 +18511,7 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umax_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -18062,6 +18595,7 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umax_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -18087,6 +18621,7 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umax_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -18156,6 +18691,7 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umax_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -18181,6 +18717,7 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umax_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -18206,6 +18743,7 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umax_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -18231,6 +18769,7 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umax_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -18314,6 +18853,7 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umax_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -18339,6 +18879,7 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umax_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -18408,6 +18949,7 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umax_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -18433,6 +18975,7 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umax_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -18458,6 +19001,7 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umax_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -18483,6 +19027,7 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umax_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -18566,6 +19111,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umax_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -18591,6 +19137,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umax_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -18660,6 +19207,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umax_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -18685,6 +19233,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umax_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -18710,6 +19259,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umax_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -18735,6 +19285,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umax_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -18818,6 +19369,7 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umax_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -18887,6 +19439,7 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umax_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -18912,6 +19465,7 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umax_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -18995,6 +19549,7 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umin_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -19064,6 +19619,7 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umin_i16_monotonic:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -19089,6 +19645,7 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umin_i16_monotonic:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
@@ -19172,6 +19729,7 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umin_i16_acquire:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -19197,6 +19755,7 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umin_i16_acquire:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -19266,6 +19825,7 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umin_i16_acquire:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -19291,6 +19851,7 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umin_i16_acquire:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -19316,6 +19877,7 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umin_i16_acquire:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -19341,6 +19903,7 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umin_i16_acquire:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -19424,6 +19987,7 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umin_i16_release:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -19449,6 +20013,7 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umin_i16_release:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -19518,6 +20083,7 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umin_i16_release:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -19543,6 +20109,7 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umin_i16_release:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -19568,6 +20135,7 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umin_i16_release:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -19593,6 +20161,7 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umin_i16_release:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -19676,6 +20245,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-WMO-LABEL: atomicrmw_umin_i16_acq_rel:
; RV32IA-WMO: # %bb.0:
; RV32IA-WMO-NEXT: andi a2, a0, -4
+; RV32IA-WMO-NEXT: andi a0, a0, 3
; RV32IA-WMO-NEXT: slli a0, a0, 3
; RV32IA-WMO-NEXT: lui a3, 16
; RV32IA-WMO-NEXT: addi a3, a3, -1
@@ -19701,6 +20271,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV32IA-TSO-LABEL: atomicrmw_umin_i16_acq_rel:
; RV32IA-TSO: # %bb.0:
; RV32IA-TSO-NEXT: andi a2, a0, -4
+; RV32IA-TSO-NEXT: andi a0, a0, 3
; RV32IA-TSO-NEXT: slli a0, a0, 3
; RV32IA-TSO-NEXT: lui a3, 16
; RV32IA-TSO-NEXT: addi a3, a3, -1
@@ -19770,6 +20341,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-NOZACAS-LABEL: atomicrmw_umin_i16_acq_rel:
; RV64IA-WMO-NOZACAS: # %bb.0:
; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16
; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1
@@ -19795,6 +20367,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_umin_i16_acq_rel:
; RV64IA-TSO-NOZACAS: # %bb.0:
; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16
; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1
@@ -19820,6 +20393,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-WMO-ZACAS-LABEL: atomicrmw_umin_i16_acq_rel:
; RV64IA-WMO-ZACAS: # %bb.0:
; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-WMO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-WMO-ZACAS-NEXT: lui a3, 16
; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1
@@ -19845,6 +20419,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
; RV64IA-TSO-ZACAS-LABEL: atomicrmw_umin_i16_acq_rel:
; RV64IA-TSO-ZACAS: # %bb.0:
; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-TSO-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-TSO-ZACAS-NEXT: lui a3, 16
; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1
@@ -19928,6 +20503,7 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umin_i16_seq_cst:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -19997,6 +20573,7 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-NOZACAS-LABEL: atomicrmw_umin_i16_seq_cst:
; RV64IA-NOZACAS: # %bb.0:
; RV64IA-NOZACAS-NEXT: andi a2, a0, -4
+; RV64IA-NOZACAS-NEXT: andi a0, a0, 3
; RV64IA-NOZACAS-NEXT: slli a0, a0, 3
; RV64IA-NOZACAS-NEXT: lui a3, 16
; RV64IA-NOZACAS-NEXT: addi a3, a3, -1
@@ -20022,6 +20599,7 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
; RV64IA-ZACAS-LABEL: atomicrmw_umin_i16_seq_cst:
; RV64IA-ZACAS: # %bb.0:
; RV64IA-ZACAS-NEXT: andi a2, a0, -4
+; RV64IA-ZACAS-NEXT: andi a0, a0, 3
; RV64IA-ZACAS-NEXT: slli a0, a0, 3
; RV64IA-ZACAS-NEXT: lui a3, 16
; RV64IA-ZACAS-NEXT: addi a3, a3, -1
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index f7268f62881276..b6d238b403995f 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -140,6 +140,7 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -174,6 +175,7 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_xchg_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -212,6 +214,7 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_add_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -246,6 +249,7 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_add_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -284,6 +288,7 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_sub_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -318,6 +323,7 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_sub_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -356,6 +362,7 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_and_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -384,6 +391,7 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_and_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -416,6 +424,7 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_nand_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -451,6 +460,7 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_nand_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -490,6 +500,7 @@ define signext i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_or_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: andi a1, a1, 255
; RV32IA-NEXT: sll a1, a1, a0
@@ -514,6 +525,7 @@ define signext i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_or_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: andi a1, a1, 255
; RV64IA-NEXT: sllw a1, a1, a0
@@ -542,6 +554,7 @@ define signext i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xor_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: andi a1, a1, 255
; RV32IA-NEXT: sll a1, a1, a0
@@ -566,6 +579,7 @@ define signext i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_xor_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: andi a1, a1, 255
; RV64IA-NEXT: sllw a1, a1, a0
@@ -625,24 +639,24 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_max_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: li a4, 255
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: li a3, 255
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 24
; RV32IA-NEXT: srai a1, a1, 24
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: xori a3, a3, 24
+; RV32IA-NEXT: xori a4, a0, 24
; RV32IA-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
; RV32IA-NEXT: lr.w a5, (a2)
-; RV32IA-NEXT: and a7, a5, a4
+; RV32IA-NEXT: and a7, a5, a3
; RV32IA-NEXT: mv a6, a5
-; RV32IA-NEXT: sll a7, a7, a3
-; RV32IA-NEXT: sra a7, a7, a3
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a7, a1, .LBB10_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1
; RV32IA-NEXT: xor a6, a5, a1
-; RV32IA-NEXT: and a6, a6, a4
+; RV32IA-NEXT: and a6, a6, a3
; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB10_3: # in Loop: Header=BB10_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
@@ -699,24 +713,24 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_max_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
-; RV64IA-NEXT: andi a3, a0, 24
-; RV64IA-NEXT: li a4, 255
-; RV64IA-NEXT: sllw a4, a4, a0
+; RV64IA-NEXT: li a3, 255
+; RV64IA-NEXT: sllw a3, a3, a0
; RV64IA-NEXT: slli a1, a1, 56
; RV64IA-NEXT: srai a1, a1, 56
; RV64IA-NEXT: sllw a1, a1, a0
-; RV64IA-NEXT: xori a3, a3, 56
+; RV64IA-NEXT: xori a4, a0, 56
; RV64IA-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
; RV64IA-NEXT: lr.w a5, (a2)
-; RV64IA-NEXT: and a7, a5, a4
+; RV64IA-NEXT: and a7, a5, a3
; RV64IA-NEXT: mv a6, a5
-; RV64IA-NEXT: sll a7, a7, a3
-; RV64IA-NEXT: sra a7, a7, a3
+; RV64IA-NEXT: sll a7, a7, a4
+; RV64IA-NEXT: sra a7, a7, a4
; RV64IA-NEXT: bge a7, a1, .LBB10_3
; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1
; RV64IA-NEXT: xor a6, a5, a1
-; RV64IA-NEXT: and a6, a6, a4
+; RV64IA-NEXT: and a6, a6, a3
; RV64IA-NEXT: xor a6, a5, a6
; RV64IA-NEXT: .LBB10_3: # in Loop: Header=BB10_1 Depth=1
; RV64IA-NEXT: sc.w a6, a6, (a2)
@@ -777,24 +791,24 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_min_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: li a4, 255
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: li a3, 255
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 24
; RV32IA-NEXT: srai a1, a1, 24
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: xori a3, a3, 24
+; RV32IA-NEXT: xori a4, a0, 24
; RV32IA-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
; RV32IA-NEXT: lr.w a5, (a2)
-; RV32IA-NEXT: and a7, a5, a4
+; RV32IA-NEXT: and a7, a5, a3
; RV32IA-NEXT: mv a6, a5
-; RV32IA-NEXT: sll a7, a7, a3
-; RV32IA-NEXT: sra a7, a7, a3
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a1, a7, .LBB11_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1
; RV32IA-NEXT: xor a6, a5, a1
-; RV32IA-NEXT: and a6, a6, a4
+; RV32IA-NEXT: and a6, a6, a3
; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB11_3: # in Loop: Header=BB11_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
@@ -851,24 +865,24 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_min_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
-; RV64IA-NEXT: andi a3, a0, 24
-; RV64IA-NEXT: li a4, 255
-; RV64IA-NEXT: sllw a4, a4, a0
+; RV64IA-NEXT: li a3, 255
+; RV64IA-NEXT: sllw a3, a3, a0
; RV64IA-NEXT: slli a1, a1, 56
; RV64IA-NEXT: srai a1, a1, 56
; RV64IA-NEXT: sllw a1, a1, a0
-; RV64IA-NEXT: xori a3, a3, 56
+; RV64IA-NEXT: xori a4, a0, 56
; RV64IA-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
; RV64IA-NEXT: lr.w a5, (a2)
-; RV64IA-NEXT: and a7, a5, a4
+; RV64IA-NEXT: and a7, a5, a3
; RV64IA-NEXT: mv a6, a5
-; RV64IA-NEXT: sll a7, a7, a3
-; RV64IA-NEXT: sra a7, a7, a3
+; RV64IA-NEXT: sll a7, a7, a4
+; RV64IA-NEXT: sra a7, a7, a4
; RV64IA-NEXT: bge a1, a7, .LBB11_3
; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1
; RV64IA-NEXT: xor a6, a5, a1
-; RV64IA-NEXT: and a6, a6, a4
+; RV64IA-NEXT: and a6, a6, a3
; RV64IA-NEXT: xor a6, a5, a6
; RV64IA-NEXT: .LBB11_3: # in Loop: Header=BB11_1 Depth=1
; RV64IA-NEXT: sc.w a6, a6, (a2)
@@ -927,6 +941,7 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umax_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -994,6 +1009,7 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_umax_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -1065,6 +1081,7 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umin_i8_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: sll a3, a3, a0
@@ -1132,6 +1149,7 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
; RV64IA-LABEL: atomicrmw_umin_i8_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a3, 255
; RV64IA-NEXT: sllw a3, a3, a0
@@ -1174,6 +1192,7 @@ define signext i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xchg_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -1209,6 +1228,7 @@ define signext i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_xchg_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -1248,6 +1268,7 @@ define signext i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_add_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -1283,6 +1304,7 @@ define signext i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_add_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -1322,6 +1344,7 @@ define signext i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_sub_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -1357,6 +1380,7 @@ define signext i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_sub_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -1396,6 +1420,7 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_and_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -1425,6 +1450,7 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_and_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -1458,6 +1484,7 @@ define signext i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_nand_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -1494,6 +1521,7 @@ define signext i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_nand_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -1534,6 +1562,7 @@ define signext i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_or_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srli a1, a1, 16
@@ -1559,6 +1588,7 @@ define signext i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_or_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: slli a1, a1, 48
; RV64IA-NEXT: srli a1, a1, 48
@@ -1588,6 +1618,7 @@ define signext i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_xor_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srli a1, a1, 16
@@ -1613,6 +1644,7 @@ define signext i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_xor_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: slli a1, a1, 48
; RV64IA-NEXT: srli a1, a1, 48
@@ -1673,32 +1705,32 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_max_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: lui a4, 16
-; RV32IA-NEXT: addi a4, a4, -1
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: lui a3, 16
+; RV32IA-NEXT: addi a3, a3, -1
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srai a1, a1, 16
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: li a5, 16
-; RV32IA-NEXT: sub a5, a5, a3
+; RV32IA-NEXT: li a4, 16
+; RV32IA-NEXT: sub a4, a4, a0
; RV32IA-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT: lr.w a3, (a2)
-; RV32IA-NEXT: and a7, a3, a4
-; RV32IA-NEXT: mv a6, a3
-; RV32IA-NEXT: sll a7, a7, a5
-; RV32IA-NEXT: sra a7, a7, a5
+; RV32IA-NEXT: lr.w a5, (a2)
+; RV32IA-NEXT: and a7, a5, a3
+; RV32IA-NEXT: mv a6, a5
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a7, a1, .LBB21_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1
-; RV32IA-NEXT: xor a6, a3, a1
-; RV32IA-NEXT: and a6, a6, a4
-; RV32IA-NEXT: xor a6, a3, a6
+; RV32IA-NEXT: xor a6, a5, a1
+; RV32IA-NEXT: and a6, a6, a3
+; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB21_3: # in Loop: Header=BB21_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
; RV32IA-NEXT: bnez a6, .LBB21_1
; RV32IA-NEXT: # %bb.4:
-; RV32IA-NEXT: srl a0, a3, a0
+; RV32IA-NEXT: srl a0, a5, a0
; RV32IA-NEXT: slli a0, a0, 16
; RV32IA-NEXT: srai a0, a0, 16
; RV32IA-NEXT: ret
@@ -1749,32 +1781,32 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_max_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
-; RV64IA-NEXT: andi a3, a0, 24
-; RV64IA-NEXT: lui a4, 16
-; RV64IA-NEXT: addi a4, a4, -1
-; RV64IA-NEXT: sllw a4, a4, a0
+; RV64IA-NEXT: lui a3, 16
+; RV64IA-NEXT: addi a3, a3, -1
+; RV64IA-NEXT: sllw a3, a3, a0
; RV64IA-NEXT: slli a1, a1, 48
; RV64IA-NEXT: srai a1, a1, 48
; RV64IA-NEXT: sllw a1, a1, a0
-; RV64IA-NEXT: li a5, 48
-; RV64IA-NEXT: sub a5, a5, a3
+; RV64IA-NEXT: li a4, 48
+; RV64IA-NEXT: sub a4, a4, a0
; RV64IA-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT: lr.w a3, (a2)
-; RV64IA-NEXT: and a7, a3, a4
-; RV64IA-NEXT: mv a6, a3
-; RV64IA-NEXT: sll a7, a7, a5
-; RV64IA-NEXT: sra a7, a7, a5
+; RV64IA-NEXT: lr.w a5, (a2)
+; RV64IA-NEXT: and a7, a5, a3
+; RV64IA-NEXT: mv a6, a5
+; RV64IA-NEXT: sll a7, a7, a4
+; RV64IA-NEXT: sra a7, a7, a4
; RV64IA-NEXT: bge a7, a1, .LBB21_3
; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1
-; RV64IA-NEXT: xor a6, a3, a1
-; RV64IA-NEXT: and a6, a6, a4
-; RV64IA-NEXT: xor a6, a3, a6
+; RV64IA-NEXT: xor a6, a5, a1
+; RV64IA-NEXT: and a6, a6, a3
+; RV64IA-NEXT: xor a6, a5, a6
; RV64IA-NEXT: .LBB21_3: # in Loop: Header=BB21_1 Depth=1
; RV64IA-NEXT: sc.w a6, a6, (a2)
; RV64IA-NEXT: bnez a6, .LBB21_1
; RV64IA-NEXT: # %bb.4:
-; RV64IA-NEXT: srlw a0, a3, a0
+; RV64IA-NEXT: srlw a0, a5, a0
; RV64IA-NEXT: slli a0, a0, 48
; RV64IA-NEXT: srai a0, a0, 48
; RV64IA-NEXT: ret
@@ -1829,32 +1861,32 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_min_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
-; RV32IA-NEXT: andi a3, a0, 24
-; RV32IA-NEXT: lui a4, 16
-; RV32IA-NEXT: addi a4, a4, -1
-; RV32IA-NEXT: sll a4, a4, a0
+; RV32IA-NEXT: lui a3, 16
+; RV32IA-NEXT: addi a3, a3, -1
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: slli a1, a1, 16
; RV32IA-NEXT: srai a1, a1, 16
; RV32IA-NEXT: sll a1, a1, a0
-; RV32IA-NEXT: li a5, 16
-; RV32IA-NEXT: sub a5, a5, a3
+; RV32IA-NEXT: li a4, 16
+; RV32IA-NEXT: sub a4, a4, a0
; RV32IA-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT: lr.w a3, (a2)
-; RV32IA-NEXT: and a7, a3, a4
-; RV32IA-NEXT: mv a6, a3
-; RV32IA-NEXT: sll a7, a7, a5
-; RV32IA-NEXT: sra a7, a7, a5
+; RV32IA-NEXT: lr.w a5, (a2)
+; RV32IA-NEXT: and a7, a5, a3
+; RV32IA-NEXT: mv a6, a5
+; RV32IA-NEXT: sll a7, a7, a4
+; RV32IA-NEXT: sra a7, a7, a4
; RV32IA-NEXT: bge a1, a7, .LBB22_3
; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1
-; RV32IA-NEXT: xor a6, a3, a1
-; RV32IA-NEXT: and a6, a6, a4
-; RV32IA-NEXT: xor a6, a3, a6
+; RV32IA-NEXT: xor a6, a5, a1
+; RV32IA-NEXT: and a6, a6, a3
+; RV32IA-NEXT: xor a6, a5, a6
; RV32IA-NEXT: .LBB22_3: # in Loop: Header=BB22_1 Depth=1
; RV32IA-NEXT: sc.w a6, a6, (a2)
; RV32IA-NEXT: bnez a6, .LBB22_1
; RV32IA-NEXT: # %bb.4:
-; RV32IA-NEXT: srl a0, a3, a0
+; RV32IA-NEXT: srl a0, a5, a0
; RV32IA-NEXT: slli a0, a0, 16
; RV32IA-NEXT: srai a0, a0, 16
; RV32IA-NEXT: ret
@@ -1905,32 +1937,32 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_min_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
-; RV64IA-NEXT: andi a3, a0, 24
-; RV64IA-NEXT: lui a4, 16
-; RV64IA-NEXT: addi a4, a4, -1
-; RV64IA-NEXT: sllw a4, a4, a0
+; RV64IA-NEXT: lui a3, 16
+; RV64IA-NEXT: addi a3, a3, -1
+; RV64IA-NEXT: sllw a3, a3, a0
; RV64IA-NEXT: slli a1, a1, 48
; RV64IA-NEXT: srai a1, a1, 48
; RV64IA-NEXT: sllw a1, a1, a0
-; RV64IA-NEXT: li a5, 48
-; RV64IA-NEXT: sub a5, a5, a3
+; RV64IA-NEXT: li a4, 48
+; RV64IA-NEXT: sub a4, a4, a0
; RV64IA-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT: lr.w a3, (a2)
-; RV64IA-NEXT: and a7, a3, a4
-; RV64IA-NEXT: mv a6, a3
-; RV64IA-NEXT: sll a7, a7, a5
-; RV64IA-NEXT: sra a7, a7, a5
+; RV64IA-NEXT: lr.w a5, (a2)
+; RV64IA-NEXT: and a7, a5, a3
+; RV64IA-NEXT: mv a6, a5
+; RV64IA-NEXT: sll a7, a7, a4
+; RV64IA-NEXT: sra a7, a7, a4
; RV64IA-NEXT: bge a1, a7, .LBB22_3
; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1
-; RV64IA-NEXT: xor a6, a3, a1
-; RV64IA-NEXT: and a6, a6, a4
-; RV64IA-NEXT: xor a6, a3, a6
+; RV64IA-NEXT: xor a6, a5, a1
+; RV64IA-NEXT: and a6, a6, a3
+; RV64IA-NEXT: xor a6, a5, a6
; RV64IA-NEXT: .LBB22_3: # in Loop: Header=BB22_1 Depth=1
; RV64IA-NEXT: sc.w a6, a6, (a2)
; RV64IA-NEXT: bnez a6, .LBB22_1
; RV64IA-NEXT: # %bb.4:
-; RV64IA-NEXT: srlw a0, a3, a0
+; RV64IA-NEXT: srlw a0, a5, a0
; RV64IA-NEXT: slli a0, a0, 48
; RV64IA-NEXT: srai a0, a0, 48
; RV64IA-NEXT: ret
@@ -1987,6 +2019,7 @@ define signext i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umax_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -2059,6 +2092,7 @@ define signext i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_umax_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -2135,6 +2169,7 @@ define signext i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV32IA-LABEL: atomicrmw_umin_i16_monotonic:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
@@ -2207,6 +2242,7 @@ define signext i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
; RV64IA-LABEL: atomicrmw_umin_i16_monotonic:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addi a3, a3, -1
@@ -3804,6 +3840,7 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp
; RV32IA-LABEL: cmpxchg_i8_monotonic_monotonic_val0:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a4, 255
; RV32IA-NEXT: sll a4, a4, a0
@@ -3844,6 +3881,7 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp
; RV64IA-LABEL: cmpxchg_i8_monotonic_monotonic_val0:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a3, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a4, 255
; RV64IA-NEXT: sllw a4, a4, a0
@@ -3888,6 +3926,7 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig
; RV32IA-LABEL: cmpxchg_i8_monotonic_monotonic_val1:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: li a4, 255
; RV32IA-NEXT: sll a4, a4, a0
@@ -3927,6 +3966,7 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig
; RV64IA-LABEL: cmpxchg_i8_monotonic_monotonic_val1:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a3, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: li a4, 255
; RV64IA-NEXT: sllw a4, a4, a0
@@ -3972,6 +4012,7 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext %
; RV32IA-LABEL: cmpxchg_i16_monotonic_monotonic_val0:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a4, 16
; RV32IA-NEXT: addi a4, a4, -1
@@ -4013,6 +4054,7 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext %
; RV64IA-LABEL: cmpxchg_i16_monotonic_monotonic_val0:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a3, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a4, 16
; RV64IA-NEXT: addi a4, a4, -1
@@ -4058,6 +4100,7 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16
; RV32IA-LABEL: cmpxchg_i16_monotonic_monotonic_val1:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a3, a0, -4
+; RV32IA-NEXT: andi a0, a0, 3
; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a4, 16
; RV32IA-NEXT: addi a4, a4, -1
@@ -4098,6 +4141,7 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16
; RV64IA-LABEL: cmpxchg_i16_monotonic_monotonic_val1:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a3, a0, -4
+; RV64IA-NEXT: andi a0, a0, 3
; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a4, 16
; RV64IA-NEXT: addi a4, a4, -1
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index a5a2ae79966c3f..33857cc461f50d 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -53,11 +53,11 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; RV32IA-LABEL: atomicrmw_uinc_wrap_i8:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
-; RV32IA-NEXT: slli a3, a0, 3
-; RV32IA-NEXT: andi a0, a3, 24
-; RV32IA-NEXT: li a5, 255
+; RV32IA-NEXT: andi a0, a0, 3
+; RV32IA-NEXT: slli a0, a0, 3
+; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: lw a4, 0(a2)
-; RV32IA-NEXT: sll a3, a5, a3
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: not a3, a3
; RV32IA-NEXT: andi a1, a1, 255
; RV32IA-NEXT: .LBB0_1: # %atomicrmw.start
@@ -127,11 +127,11 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; RV64IA-LABEL: atomicrmw_uinc_wrap_i8:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a4, a0, 3
-; RV64IA-NEXT: andi a0, a4, 24
-; RV64IA-NEXT: li a5, 255
+; RV64IA-NEXT: andi a0, a0, 3
+; RV64IA-NEXT: slli a0, a0, 3
+; RV64IA-NEXT: li a4, 255
; RV64IA-NEXT: lw a3, 0(a2)
-; RV64IA-NEXT: sllw a4, a5, a4
+; RV64IA-NEXT: sllw a4, a4, a0
; RV64IA-NEXT: not a4, a4
; RV64IA-NEXT: andi a1, a1, 255
; RV64IA-NEXT: .LBB0_1: # %atomicrmw.start
@@ -210,12 +210,12 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV32IA-LABEL: atomicrmw_uinc_wrap_i16:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
-; RV32IA-NEXT: slli a4, a0, 3
-; RV32IA-NEXT: andi a0, a4, 24
+; RV32IA-NEXT: andi a0, a0, 3
+; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
; RV32IA-NEXT: lw a5, 0(a2)
-; RV32IA-NEXT: sll a4, a3, a4
+; RV32IA-NEXT: sll a4, a3, a0
; RV32IA-NEXT: not a4, a4
; RV32IA-NEXT: and a1, a1, a3
; RV32IA-NEXT: .LBB1_1: # %atomicrmw.start
@@ -290,12 +290,12 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV64IA-LABEL: atomicrmw_uinc_wrap_i16:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a5, a0, 3
-; RV64IA-NEXT: andi a0, a5, 24
+; RV64IA-NEXT: andi a0, a0, 3
+; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addiw a3, a3, -1
; RV64IA-NEXT: lw a4, 0(a2)
-; RV64IA-NEXT: sllw a5, a3, a5
+; RV64IA-NEXT: sllw a5, a3, a0
; RV64IA-NEXT: not a5, a5
; RV64IA-NEXT: and a1, a1, a3
; RV64IA-NEXT: .LBB1_1: # %atomicrmw.start
@@ -680,11 +680,11 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; RV32IA-LABEL: atomicrmw_udec_wrap_i8:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
-; RV32IA-NEXT: slli a3, a0, 3
-; RV32IA-NEXT: andi a0, a3, 24
-; RV32IA-NEXT: li a4, 255
+; RV32IA-NEXT: andi a0, a0, 3
+; RV32IA-NEXT: slli a0, a0, 3
+; RV32IA-NEXT: li a3, 255
; RV32IA-NEXT: lw a6, 0(a2)
-; RV32IA-NEXT: sll a3, a4, a3
+; RV32IA-NEXT: sll a3, a3, a0
; RV32IA-NEXT: not a3, a3
; RV32IA-NEXT: andi a4, a1, 255
; RV32IA-NEXT: j .LBB4_2
@@ -776,11 +776,11 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; RV64IA-LABEL: atomicrmw_udec_wrap_i8:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a4, a0, 3
-; RV64IA-NEXT: andi a0, a4, 24
-; RV64IA-NEXT: li a5, 255
+; RV64IA-NEXT: andi a0, a0, 3
+; RV64IA-NEXT: slli a0, a0, 3
+; RV64IA-NEXT: li a4, 255
; RV64IA-NEXT: lw a3, 0(a2)
-; RV64IA-NEXT: sllw a4, a5, a4
+; RV64IA-NEXT: sllw a4, a4, a0
; RV64IA-NEXT: not a4, a4
; RV64IA-NEXT: andi a5, a1, 255
; RV64IA-NEXT: j .LBB4_2
@@ -881,12 +881,12 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; RV32IA-LABEL: atomicrmw_udec_wrap_i16:
; RV32IA: # %bb.0:
; RV32IA-NEXT: andi a2, a0, -4
-; RV32IA-NEXT: slli a4, a0, 3
-; RV32IA-NEXT: andi a0, a4, 24
+; RV32IA-NEXT: andi a0, a0, 3
+; RV32IA-NEXT: slli a0, a0, 3
; RV32IA-NEXT: lui a3, 16
; RV32IA-NEXT: addi a3, a3, -1
; RV32IA-NEXT: lw a7, 0(a2)
-; RV32IA-NEXT: sll a4, a3, a4
+; RV32IA-NEXT: sll a4, a3, a0
; RV32IA-NEXT: not a4, a4
; RV32IA-NEXT: and a5, a1, a3
; RV32IA-NEXT: j .LBB5_2
@@ -983,12 +983,12 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; RV64IA-LABEL: atomicrmw_udec_wrap_i16:
; RV64IA: # %bb.0:
; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a5, a0, 3
-; RV64IA-NEXT: andi a0, a5, 24
+; RV64IA-NEXT: andi a0, a0, 3
+; RV64IA-NEXT: slli a0, a0, 3
; RV64IA-NEXT: lui a3, 16
; RV64IA-NEXT: addiw a3, a3, -1
; RV64IA-NEXT: lw a4, 0(a2)
-; RV64IA-NEXT: sllw a5, a3, a5
+; RV64IA-NEXT: sllw a5, a3, a0
; RV64IA-NEXT: not a5, a5
; RV64IA-NEXT: and a6, a1, a3
; RV64IA-NEXT: j .LBB5_2
diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll
index d280e5ee46b7c0..13cf217f79c23d 100644
--- a/llvm/test/CodeGen/RISCV/bittest.ll
+++ b/llvm/test/CodeGen/RISCV/bittest.ll
@@ -724,12 +724,11 @@ define signext i32 @bit_31_z_select_i32(i32 signext %a, i32 signext %b, i32 sign
define signext i32 @bit_31_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
; RV32-LABEL: bit_31_nz_select_i32:
; RV32: # %bb.0:
-; RV32-NEXT: srli a3, a0, 31
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: bnez a3, .LBB22_2
+; RV32-NEXT: bltz a0, .LBB22_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a0, a2
+; RV32-NEXT: mv a1, a2
; RV32-NEXT: .LBB22_2:
+; RV32-NEXT: mv a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: bit_31_nz_select_i32:
@@ -1224,12 +1223,11 @@ define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) {
;
; RV64-LABEL: bit_63_nz_select_i64:
; RV64: # %bb.0:
-; RV64-NEXT: srli a3, a0, 63
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: bnez a3, .LBB36_2
+; RV64-NEXT: bltz a0, .LBB36_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a0, a2
+; RV64-NEXT: mv a1, a2
; RV64-NEXT: .LBB36_2:
+; RV64-NEXT: mv a0, a1
; RV64-NEXT: ret
%1 = and i64 %a, 9223372036854775808
%2 = icmp ne i64 %1, 0
diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index 9c69fe0a6e4865..1ab84a8eb077e9 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -323,10 +323,11 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; RV32ZBB-LABEL: test_bitreverse_i16:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: rev8 a0, a0
-; RV32ZBB-NEXT: srli a1, a0, 12
-; RV32ZBB-NEXT: lui a2, 15
-; RV32ZBB-NEXT: addi a2, a2, 240
+; RV32ZBB-NEXT: srli a1, a0, 16
+; RV32ZBB-NEXT: lui a2, 1
+; RV32ZBB-NEXT: addi a2, a2, -241
; RV32ZBB-NEXT: and a1, a1, a2
+; RV32ZBB-NEXT: slli a1, a1, 4
; RV32ZBB-NEXT: srli a0, a0, 20
; RV32ZBB-NEXT: andi a0, a0, -241
; RV32ZBB-NEXT: or a0, a0, a1
@@ -349,10 +350,11 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; RV64ZBB-LABEL: test_bitreverse_i16:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: rev8 a0, a0
-; RV64ZBB-NEXT: srli a1, a0, 44
-; RV64ZBB-NEXT: lui a2, 15
-; RV64ZBB-NEXT: addiw a2, a2, 240
+; RV64ZBB-NEXT: srli a1, a0, 48
+; RV64ZBB-NEXT: lui a2, 1
+; RV64ZBB-NEXT: addiw a2, a2, -241
; RV64ZBB-NEXT: and a1, a1, a2
+; RV64ZBB-NEXT: slli a1, a1, 4
; RV64ZBB-NEXT: srli a0, a0, 52
; RV64ZBB-NEXT: andi a0, a0, -241
; RV64ZBB-NEXT: or a0, a0, a1
@@ -495,11 +497,9 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; RV64ZBB-NEXT: lui a2, 61681
; RV64ZBB-NEXT: addiw a2, a2, -241
; RV64ZBB-NEXT: and a1, a1, a2
-; RV64ZBB-NEXT: srli a0, a0, 28
-; RV64ZBB-NEXT: lui a2, 986895
-; RV64ZBB-NEXT: addi a2, a2, 240
+; RV64ZBB-NEXT: srli a0, a0, 32
; RV64ZBB-NEXT: and a0, a0, a2
-; RV64ZBB-NEXT: sext.w a0, a0
+; RV64ZBB-NEXT: slliw a0, a0, 4
; RV64ZBB-NEXT: or a0, a1, a0
; RV64ZBB-NEXT: srli a1, a0, 2
; RV64ZBB-NEXT: lui a2, 209715
@@ -1523,17 +1523,19 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
define i32 @pr55484(i32 %0) {
; RV32I-LABEL: pr55484:
; RV32I: # %bb.0:
-; RV32I-NEXT: slli a1, a0, 8
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srai a0, a0, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: pr55484:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a0, 40
-; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srai a0, a0, 48
; RV64I-NEXT: ret
;
@@ -1555,17 +1557,19 @@ define i32 @pr55484(i32 %0) {
;
; RV32ZBKB-LABEL: pr55484:
; RV32ZBKB: # %bb.0:
-; RV32ZBKB-NEXT: slli a1, a0, 8
-; RV32ZBKB-NEXT: slli a0, a0, 24
-; RV32ZBKB-NEXT: or a0, a0, a1
+; RV32ZBKB-NEXT: slli a1, a0, 24
+; RV32ZBKB-NEXT: srli a0, a0, 8
+; RV32ZBKB-NEXT: slli a0, a0, 16
+; RV32ZBKB-NEXT: or a0, a1, a0
; RV32ZBKB-NEXT: srai a0, a0, 16
; RV32ZBKB-NEXT: ret
;
; RV64ZBKB-LABEL: pr55484:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: slli a1, a0, 40
-; RV64ZBKB-NEXT: slli a0, a0, 56
-; RV64ZBKB-NEXT: or a0, a0, a1
+; RV64ZBKB-NEXT: slli a1, a0, 56
+; RV64ZBKB-NEXT: srli a0, a0, 8
+; RV64ZBKB-NEXT: slli a0, a0, 48
+; RV64ZBKB-NEXT: or a0, a1, a0
; RV64ZBKB-NEXT: srai a0, a0, 48
; RV64ZBKB-NEXT: ret
%2 = lshr i32 %0, 8
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 549d531e829ea5..2baa13c2dbd52e 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -1233,7 +1233,8 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: li a0, 32
@@ -1870,7 +1871,8 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
;
; RV32M-LABEL: test_ctlz_i32_zero_undef:
@@ -2453,7 +2455,8 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
;
; RV32M-LABEL: test_ctpop_i32:
@@ -2561,7 +2564,8 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
; RV64XTHEADBB-NEXT: add a0, a0, a1
; RV64XTHEADBB-NEXT: slli a1, a0, 16
; RV64XTHEADBB-NEXT: add a0, a0, a1
-; RV64XTHEADBB-NEXT: srliw a0, a0, 24
+; RV64XTHEADBB-NEXT: slli a0, a0, 34
+; RV64XTHEADBB-NEXT: srli a0, a0, 58
; RV64XTHEADBB-NEXT: ret
%1 = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 91ac7c5ddae3ff..08fa5ff7ba51ad 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -57,6 +57,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: slli a0, a0, 0
; RV64IM-NEXT: srli a0, a0, 2
; RV64IM-NEXT: ret
;
@@ -70,6 +71,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: subw a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
+; RV64IMZB-NEXT: slli a0, a0, 0
; RV64IMZB-NEXT: srli a0, a0, 2
; RV64IMZB-NEXT: ret
%1 = udiv i32 %a, 7
@@ -77,29 +79,52 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
}
define i64 @udiv64_constant_no_add(i64 %a) nounwind {
-; RV32-LABEL: udiv64_constant_no_add:
-; RV32: # %bb.0:
-; RV32-NEXT: add a2, a0, a1
-; RV32-NEXT: sltu a3, a2, a0
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 838861
-; RV32-NEXT: addi a4, a3, -819
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 2
-; RV32-NEXT: andi a5, a5, -4
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -820
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
-; RV32-NEXT: ret
+; RV32IM-LABEL: udiv64_constant_no_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: add a2, a0, a1
+; RV32IM-NEXT: sltu a3, a2, a0
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: lui a3, 838861
+; RV32IM-NEXT: addi a4, a3, -819
+; RV32IM-NEXT: mulhu a5, a2, a4
+; RV32IM-NEXT: srli a5, a5, 2
+; RV32IM-NEXT: slli a6, a5, 2
+; RV32IM-NEXT: add a5, a6, a5
+; RV32IM-NEXT: sub a2, a2, a5
+; RV32IM-NEXT: sub a5, a0, a2
+; RV32IM-NEXT: addi a3, a3, -820
+; RV32IM-NEXT: mul a3, a5, a3
+; RV32IM-NEXT: mulhu a6, a5, a4
+; RV32IM-NEXT: add a3, a6, a3
+; RV32IM-NEXT: sltu a0, a0, a2
+; RV32IM-NEXT: sub a1, a1, a0
+; RV32IM-NEXT: mul a1, a1, a4
+; RV32IM-NEXT: add a1, a3, a1
+; RV32IM-NEXT: mul a0, a5, a4
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv64_constant_no_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: add a2, a0, a1
+; RV32IMZB-NEXT: sltu a3, a2, a0
+; RV32IMZB-NEXT: add a2, a2, a3
+; RV32IMZB-NEXT: lui a3, 838861
+; RV32IMZB-NEXT: addi a4, a3, -819
+; RV32IMZB-NEXT: mulhu a5, a2, a4
+; RV32IMZB-NEXT: srli a5, a5, 2
+; RV32IMZB-NEXT: sh2add a5, a5, a5
+; RV32IMZB-NEXT: sub a2, a2, a5
+; RV32IMZB-NEXT: sub a5, a0, a2
+; RV32IMZB-NEXT: addi a3, a3, -820
+; RV32IMZB-NEXT: mul a3, a5, a3
+; RV32IMZB-NEXT: mulhu a6, a5, a4
+; RV32IMZB-NEXT: add a3, a6, a3
+; RV32IMZB-NEXT: sltu a0, a0, a2
+; RV32IMZB-NEXT: sub a1, a1, a0
+; RV32IMZB-NEXT: mul a1, a1, a4
+; RV32IMZB-NEXT: add a1, a3, a1
+; RV32IMZB-NEXT: mul a0, a5, a4
+; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_no_add:
; RV64: # %bb.0:
@@ -488,10 +513,10 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV32IM-NEXT: srai a0, a0, 24
; RV32IM-NEXT: li a1, 86
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srli a1, a0, 8
-; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: srli a0, a0, 31
-; RV32IM-NEXT: add a0, a1, a0
+; RV32IM-NEXT: srli a0, a0, 8
+; RV32IM-NEXT: slli a1, a0, 24
+; RV32IM-NEXT: srli a1, a1, 31
+; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: sdiv8_constant_no_srai:
@@ -499,10 +524,10 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV32IMZB-NEXT: sext.b a0, a0
; RV32IMZB-NEXT: li a1, 86
; RV32IMZB-NEXT: mul a0, a0, a1
-; RV32IMZB-NEXT: srli a1, a0, 8
-; RV32IMZB-NEXT: slli a0, a0, 16
-; RV32IMZB-NEXT: srli a0, a0, 31
-; RV32IMZB-NEXT: add a0, a1, a0
+; RV32IMZB-NEXT: srli a0, a0, 8
+; RV32IMZB-NEXT: slli a1, a0, 24
+; RV32IMZB-NEXT: srli a1, a1, 31
+; RV32IMZB-NEXT: add a0, a0, a1
; RV32IMZB-NEXT: ret
;
; RV64IM-LABEL: sdiv8_constant_no_srai:
@@ -511,10 +536,10 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a0, a0, 56
; RV64IM-NEXT: li a1, 86
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srli a1, a0, 8
-; RV64IM-NEXT: slli a0, a0, 48
-; RV64IM-NEXT: srli a0, a0, 63
-; RV64IM-NEXT: add a0, a1, a0
+; RV64IM-NEXT: srli a0, a0, 8
+; RV64IM-NEXT: slli a1, a0, 56
+; RV64IM-NEXT: srli a1, a1, 63
+; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: sdiv8_constant_no_srai:
@@ -522,10 +547,10 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: sext.b a0, a0
; RV64IMZB-NEXT: li a1, 86
; RV64IMZB-NEXT: mul a0, a0, a1
-; RV64IMZB-NEXT: srli a1, a0, 8
-; RV64IMZB-NEXT: slli a0, a0, 48
-; RV64IMZB-NEXT: srli a0, a0, 63
-; RV64IMZB-NEXT: add a0, a1, a0
+; RV64IMZB-NEXT: srli a0, a0, 8
+; RV64IMZB-NEXT: slli a1, a0, 56
+; RV64IMZB-NEXT: srli a1, a1, 63
+; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: ret
%1 = sdiv i8 %a, 3
ret i8 %1
@@ -539,7 +564,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV32IM-NEXT: li a1, 103
; RV32IM-NEXT: mul a0, a0, a1
; RV32IM-NEXT: srai a1, a0, 9
-; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srli a0, a0, 8
+; RV32IM-NEXT: slli a0, a0, 24
; RV32IM-NEXT: srli a0, a0, 31
; RV32IM-NEXT: add a0, a1, a0
; RV32IM-NEXT: ret
@@ -550,7 +576,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV32IMZB-NEXT: li a1, 103
; RV32IMZB-NEXT: mul a0, a0, a1
; RV32IMZB-NEXT: srai a1, a0, 9
-; RV32IMZB-NEXT: slli a0, a0, 16
+; RV32IMZB-NEXT: srli a0, a0, 8
+; RV32IMZB-NEXT: slli a0, a0, 24
; RV32IMZB-NEXT: srli a0, a0, 31
; RV32IMZB-NEXT: add a0, a1, a0
; RV32IMZB-NEXT: ret
@@ -562,7 +589,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV64IM-NEXT: li a1, 103
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: srai a1, a0, 9
-; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srli a0, a0, 8
+; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srli a0, a0, 63
; RV64IM-NEXT: add a0, a1, a0
; RV64IM-NEXT: ret
@@ -573,7 +601,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: li a1, 103
; RV64IMZB-NEXT: mul a0, a0, a1
; RV64IMZB-NEXT: srai a1, a0, 9
-; RV64IMZB-NEXT: slli a0, a0, 48
+; RV64IMZB-NEXT: srli a0, a0, 8
+; RV64IMZB-NEXT: slli a0, a0, 56
; RV64IMZB-NEXT: srli a0, a0, 63
; RV64IMZB-NEXT: add a0, a1, a0
; RV64IMZB-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index 99c83b99497dd3..62b098d972e541 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -187,9 +187,9 @@ define i64 @udiv64_constant(i64 %a) nounwind {
; RV32IM-NEXT: lui a3, 838861
; RV32IM-NEXT: addi a4, a3, -819
; RV32IM-NEXT: mulhu a5, a2, a4
-; RV32IM-NEXT: srli a6, a5, 2
-; RV32IM-NEXT: andi a5, a5, -4
-; RV32IM-NEXT: add a5, a5, a6
+; RV32IM-NEXT: srli a5, a5, 2
+; RV32IM-NEXT: slli a6, a5, 2
+; RV32IM-NEXT: add a5, a6, a5
; RV32IM-NEXT: sub a2, a2, a5
; RV32IM-NEXT: sub a5, a0, a2
; RV32IM-NEXT: addi a3, a3, -820
@@ -981,7 +981,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
; RV32IM-NEXT: li a1, 103
; RV32IM-NEXT: mul a0, a0, a1
; RV32IM-NEXT: srai a1, a0, 9
-; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srli a0, a0, 8
+; RV32IM-NEXT: slli a0, a0, 24
; RV32IM-NEXT: srli a0, a0, 31
; RV32IM-NEXT: add a0, a1, a0
; RV32IM-NEXT: ret
@@ -1005,7 +1006,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
; RV64IM-NEXT: li a1, 103
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: srai a1, a0, 9
-; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srli a0, a0, 8
+; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srli a0, a0, 63
; RV64IM-NEXT: add a0, a1, a0
; RV64IM-NEXT: ret
@@ -1018,7 +1020,8 @@ define i8 @sdiv8_pow2(i8 %a) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: slli a1, a0, 24
; RV32I-NEXT: srai a1, a1, 24
-; RV32I-NEXT: slli a1, a1, 17
+; RV32I-NEXT: srli a1, a1, 7
+; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: srli a1, a1, 29
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: slli a0, a0, 24
@@ -1029,7 +1032,8 @@ define i8 @sdiv8_pow2(i8 %a) nounwind {
; RV32IM: # %bb.0:
; RV32IM-NEXT: slli a1, a0, 24
; RV32IM-NEXT: srai a1, a1, 24
-; RV32IM-NEXT: slli a1, a1, 17
+; RV32IM-NEXT: srli a1, a1, 7
+; RV32IM-NEXT: slli a1, a1, 24
; RV32IM-NEXT: srli a1, a1, 29
; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: slli a0, a0, 24
@@ -1040,7 +1044,8 @@ define i8 @sdiv8_pow2(i8 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: slli a1, a0, 56
; RV64I-NEXT: srai a1, a1, 56
-; RV64I-NEXT: slli a1, a1, 49
+; RV64I-NEXT: srli a1, a1, 7
+; RV64I-NEXT: slli a1, a1, 56
; RV64I-NEXT: srli a1, a1, 61
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a0, a0, 56
@@ -1051,7 +1056,8 @@ define i8 @sdiv8_pow2(i8 %a) nounwind {
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 56
; RV64IM-NEXT: srai a1, a1, 56
-; RV64IM-NEXT: slli a1, a1, 49
+; RV64IM-NEXT: srli a1, a1, 7
+; RV64IM-NEXT: slli a1, a1, 56
; RV64IM-NEXT: srli a1, a1, 61
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: slli a0, a0, 56
diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
index a00d82942cabe8..72d733842f25c7 100644
--- a/llvm/test/CodeGen/RISCV/float-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
@@ -1755,25 +1755,29 @@ define i1 @isqnan_fpclass(float %x) {
; RV32IF-LABEL: isqnan_fpclass:
; RV32IF: # %bb.0:
; RV32IF-NEXT: fclass.s a0, fa0
-; RV32IF-NEXT: srli a0, a0, 9
+; RV32IF-NEXT: slli a0, a0, 22
+; RV32IF-NEXT: srli a0, a0, 31
; RV32IF-NEXT: ret
;
; RV32IZFINX-LABEL: isqnan_fpclass:
; RV32IZFINX: # %bb.0:
; RV32IZFINX-NEXT: fclass.s a0, a0
-; RV32IZFINX-NEXT: srli a0, a0, 9
+; RV32IZFINX-NEXT: slli a0, a0, 22
+; RV32IZFINX-NEXT: srli a0, a0, 31
; RV32IZFINX-NEXT: ret
;
; RV64IF-LABEL: isqnan_fpclass:
; RV64IF: # %bb.0:
; RV64IF-NEXT: fclass.s a0, fa0
-; RV64IF-NEXT: srli a0, a0, 9
+; RV64IF-NEXT: slli a0, a0, 54
+; RV64IF-NEXT: srli a0, a0, 63
; RV64IF-NEXT: ret
;
; RV64IZFINX-LABEL: isqnan_fpclass:
; RV64IZFINX: # %bb.0:
; RV64IZFINX-NEXT: fclass.s a0, a0
-; RV64IZFINX-NEXT: srli a0, a0, 9
+; RV64IZFINX-NEXT: slli a0, a0, 54
+; RV64IZFINX-NEXT: srli a0, a0, 63
; RV64IZFINX-NEXT: ret
;
; RV32I-LABEL: isqnan_fpclass:
diff --git a/llvm/test/CodeGen/RISCV/pr65025.ll b/llvm/test/CodeGen/RISCV/pr65025.ll
index dcd71edc460b8d..0189c873d48d46 100644
--- a/llvm/test/CodeGen/RISCV/pr65025.ll
+++ b/llvm/test/CodeGen/RISCV/pr65025.ll
@@ -5,7 +5,8 @@ define ptr @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %va
; CHECK-LABEL: cmpxchg_masked_and_branch1:
; CHECK: # %bb.0: # %do_cmpxchg
; CHECK-NEXT: andi a3, a0, -4
-; CHECK-NEXT: slli a4, a0, 3
+; CHECK-NEXT: andi a4, a0, 3
+; CHECK-NEXT: slli a4, a4, 3
; CHECK-NEXT: li a5, 255
; CHECK-NEXT: sllw a5, a5, a4
; CHECK-NEXT: andi a1, a1, 255
diff --git a/llvm/test/CodeGen/RISCV/rem.ll b/llvm/test/CodeGen/RISCV/rem.ll
index 5b27c4129df6ad..3e0c4b864cbcad 100644
--- a/llvm/test/CodeGen/RISCV/rem.ll
+++ b/llvm/test/CodeGen/RISCV/rem.ll
@@ -108,7 +108,8 @@ define i32 @srem_pow2(i32 %a) nounwind {
; RV32I-NEXT: srai a1, a0, 31
; RV32I-NEXT: srli a1, a1, 29
; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: andi a1, a1, -8
+; RV32I-NEXT: srai a1, a1, 3
+; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: ret
;
@@ -117,7 +118,8 @@ define i32 @srem_pow2(i32 %a) nounwind {
; RV32IM-NEXT: srai a1, a0, 31
; RV32IM-NEXT: srli a1, a1, 29
; RV32IM-NEXT: add a1, a0, a1
-; RV32IM-NEXT: andi a1, a1, -8
+; RV32IM-NEXT: srai a1, a1, 3
+; RV32IM-NEXT: slli a1, a1, 3
; RV32IM-NEXT: sub a0, a0, a1
; RV32IM-NEXT: ret
;
@@ -126,7 +128,8 @@ define i32 @srem_pow2(i32 %a) nounwind {
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: srliw a1, a1, 29
; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: andi a1, a1, -8
+; RV64I-NEXT: srli a1, a1, 3
+; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: ret
;
@@ -135,7 +138,8 @@ define i32 @srem_pow2(i32 %a) nounwind {
; RV64IM-NEXT: sraiw a1, a0, 31
; RV64IM-NEXT: srliw a1, a1, 29
; RV64IM-NEXT: add a1, a0, a1
-; RV64IM-NEXT: andi a1, a1, -8
+; RV64IM-NEXT: srli a1, a1, 3
+; RV64IM-NEXT: slli a1, a1, 3
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: ret
%1 = srem i32 %a, 8
@@ -148,8 +152,8 @@ define i32 @srem_pow2_2(i32 %a) nounwind {
; RV32I-NEXT: srai a1, a0, 31
; RV32I-NEXT: srli a1, a1, 16
; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: lui a2, 1048560
-; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: srai a1, a1, 16
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: ret
;
@@ -158,8 +162,8 @@ define i32 @srem_pow2_2(i32 %a) nounwind {
; RV32IM-NEXT: srai a1, a0, 31
; RV32IM-NEXT: srli a1, a1, 16
; RV32IM-NEXT: add a1, a0, a1
-; RV32IM-NEXT: lui a2, 1048560
-; RV32IM-NEXT: and a1, a1, a2
+; RV32IM-NEXT: srai a1, a1, 16
+; RV32IM-NEXT: slli a1, a1, 16
; RV32IM-NEXT: sub a0, a0, a1
; RV32IM-NEXT: ret
;
@@ -168,8 +172,8 @@ define i32 @srem_pow2_2(i32 %a) nounwind {
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: srliw a1, a1, 16
; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: lui a2, 1048560
-; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: srli a1, a1, 16
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: ret
;
@@ -178,8 +182,8 @@ define i32 @srem_pow2_2(i32 %a) nounwind {
; RV64IM-NEXT: sraiw a1, a0, 31
; RV64IM-NEXT: srliw a1, a1, 16
; RV64IM-NEXT: add a1, a0, a1
-; RV64IM-NEXT: lui a2, 1048560
-; RV64IM-NEXT: and a1, a1, a2
+; RV64IM-NEXT: srli a1, a1, 16
+; RV64IM-NEXT: slli a1, a1, 16
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: ret
%1 = srem i32 %a, 65536
diff --git a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
index 957f44f9f669de..62c99dedbf2f6c 100644
--- a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
@@ -9,10 +9,14 @@ define void @test(ptr nocapture noundef writeonly %array1, i32 noundef signext %
; RV64-NEXT: slli a4, a3, 2
; RV64-NEXT: add a4, a0, a4
; RV64-NEXT: sw a2, 0(a4)
+; RV64-NEXT: addiw a4, a1, 6
+; RV64-NEXT: slli a4, a4, 2
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: sw a2, 0(a4)
+; RV64-NEXT: addiw a1, a1, 35
; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: sw a2, 24(a0)
-; RV64-NEXT: sw a3, 140(a0)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sw a3, 0(a0)
; RV64-NEXT: ret
entry:
%add = add nsw i32 %a, 5
@@ -35,18 +39,22 @@ define void @test1(ptr nocapture noundef %array1, i32 noundef signext %a, i32 no
; RV64-LABEL: test1:
; RV64: # %bb.0: # %entry
; RV64-NEXT: addiw a4, a1, 5
-; RV64-NEXT: slli a5, a4, 2
-; RV64-NEXT: add a5, a0, a5
-; RV64-NEXT: mv a6, a4
+; RV64-NEXT: slli a6, a4, 2
+; RV64-NEXT: add a6, a0, a6
+; RV64-NEXT: mv a5, a4
; RV64-NEXT: bgtz a3, .LBB1_2
; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a6, a2
+; RV64-NEXT: mv a5, a2
; RV64-NEXT: .LBB1_2: # %entry
-; RV64-NEXT: sw a6, 0(a5)
+; RV64-NEXT: sw a5, 0(a6)
+; RV64-NEXT: addiw a2, a1, 6
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: sw a5, 0(a2)
+; RV64-NEXT: addiw a1, a1, 35
; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: sw a6, 24(a0)
-; RV64-NEXT: sw a4, 140(a0)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sw a4, 0(a0)
; RV64-NEXT: ret
entry:
%add = add nsw i32 %a, 5
diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index d907a37c2b3d17..0d4ee4aef7b8eb 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -1546,7 +1546,8 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
;
; RV32XTHEADBB-LABEL: rotl_64_mask_shared:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: th.extu a5, a4, 5, 5
+; RV32XTHEADBB-NEXT: slli a5, a4, 26
+; RV32XTHEADBB-NEXT: srli a5, a5, 31
; RV32XTHEADBB-NEXT: mv a7, a0
; RV32XTHEADBB-NEXT: bnez a5, .LBB17_2
; RV32XTHEADBB-NEXT: # %bb.1:
@@ -2011,7 +2012,8 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
;
; RV32XTHEADBB-LABEL: rotl_64_mask_multiple:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: th.extu a5, a4, 5, 5
+; RV32XTHEADBB-NEXT: slli a5, a4, 26
+; RV32XTHEADBB-NEXT: srli a5, a5, 31
; RV32XTHEADBB-NEXT: mv a6, a1
; RV32XTHEADBB-NEXT: bnez a5, .LBB21_2
; RV32XTHEADBB-NEXT: # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
index 17d9e9cefe117e..6a18ab6a2998f4 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
@@ -533,7 +533,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
; RV64IM-NEXT: li a1, 103
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: sraiw a1, a0, 9
-; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srliw a0, a0, 8
+; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srli a0, a0, 63
; RV64IM-NEXT: addw a0, a1, a0
; RV64IM-NEXT: ret
@@ -546,7 +547,8 @@ define i8 @sdiv8_pow2(i8 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: slli a1, a0, 24
; RV64I-NEXT: sraiw a1, a1, 24
-; RV64I-NEXT: slli a1, a1, 49
+; RV64I-NEXT: srliw a1, a1, 7
+; RV64I-NEXT: slli a1, a1, 56
; RV64I-NEXT: srli a1, a1, 61
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a0, a0, 24
@@ -557,7 +559,8 @@ define i8 @sdiv8_pow2(i8 %a) nounwind {
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 24
; RV64IM-NEXT: sraiw a1, a1, 24
-; RV64IM-NEXT: slli a1, a1, 49
+; RV64IM-NEXT: srliw a1, a1, 7
+; RV64IM-NEXT: slli a1, a1, 56
; RV64IM-NEXT: srli a1, a1, 61
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: slli a0, a0, 24
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rem.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rem.ll
index 9d7b77de03eefa..d3c933f77e3bf3 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rem.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rem.ll
@@ -74,7 +74,8 @@ define i32 @srem_pow2(i32 %a) nounwind {
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: srliw a1, a1, 29
; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: andi a1, a1, -8
+; RV64I-NEXT: sraiw a1, a1, 3
+; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: ret
;
@@ -83,7 +84,8 @@ define i32 @srem_pow2(i32 %a) nounwind {
; RV64IM-NEXT: sraiw a1, a0, 31
; RV64IM-NEXT: srliw a1, a1, 29
; RV64IM-NEXT: add a1, a0, a1
-; RV64IM-NEXT: andi a1, a1, -8
+; RV64IM-NEXT: sraiw a1, a1, 3
+; RV64IM-NEXT: slli a1, a1, 3
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: ret
%1 = srem i32 %a, 8
@@ -96,8 +98,8 @@ define i32 @srem_pow2_2(i32 %a) nounwind {
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: srliw a1, a1, 16
; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: lui a2, 1048560
-; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: sraiw a1, a1, 16
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: ret
;
@@ -106,8 +108,8 @@ define i32 @srem_pow2_2(i32 %a) nounwind {
; RV64IM-NEXT: sraiw a1, a0, 31
; RV64IM-NEXT: srliw a1, a1, 16
; RV64IM-NEXT: add a1, a0, a1
-; RV64IM-NEXT: lui a2, 1048560
-; RV64IM-NEXT: and a1, a1, a2
+; RV64IM-NEXT: sraiw a1, a1, 16
+; RV64IM-NEXT: slli a1, a1, 16
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: ret
%1 = srem i32 %a, 65536
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
index c3ae40124ba04b..bc4e6da1215e20 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -1358,10 +1358,9 @@ define zeroext i32 @sext_ashr_zext_i16(i16 %a) nounwind {
define signext i16 @sh1adduw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: sh1adduw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 33
-; RV64I-NEXT: addi a2, a2, -2
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 31
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: lh a0, 0(a0)
; RV64I-NEXT: ret
@@ -1382,10 +1381,9 @@ define signext i16 @sh1adduw_ptrdiff(i64 %diff, ptr %baseptr) {
define signext i32 @sh2adduw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: sh2adduw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 34
-; RV64I-NEXT: addi a2, a2, -4
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 30
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: ret
@@ -1406,10 +1404,9 @@ define signext i32 @sh2adduw_ptrdiff(i64 %diff, ptr %baseptr) {
define i64 @sh3adduw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: sh3adduw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 35
-; RV64I-NEXT: addi a2, a2, -8
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 3
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 29
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: ld a0, 0(a0)
; RV64I-NEXT: ret
@@ -1431,7 +1428,8 @@ define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) {
; RV64I-LABEL: srliw_1_sh1add:
; RV64I: # %bb.0:
; RV64I-NEXT: srliw a1, a1, 1
-; RV64I-NEXT: slli a1, a1, 1
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: srli a1, a1, 31
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lh a0, 0(a0)
; RV64I-NEXT: ret
@@ -1452,10 +1450,9 @@ define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) {
define i128 @slliuw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: slliuw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 36
-; RV64I-NEXT: addi a2, a2, -16
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 28
; RV64I-NEXT: add a1, a1, a0
; RV64I-NEXT: ld a0, 0(a1)
; RV64I-NEXT: ld a1, 8(a1)
@@ -1480,7 +1477,8 @@ define signext i32 @srliw_2_sh2add(ptr %0, i32 signext %1) {
; RV64I-LABEL: srliw_2_sh2add:
; RV64I: # %bb.0:
; RV64I-NEXT: srliw a1, a1, 2
-; RV64I-NEXT: slli a1, a1, 2
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: srli a1, a1, 30
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: ret
@@ -1502,7 +1500,8 @@ define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) {
; RV64I-LABEL: srliw_3_sh3add:
; RV64I: # %bb.0:
; RV64I-NEXT: srliw a1, a1, 3
-; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: srli a1, a1, 29
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: ld a0, 0(a0)
; RV64I-NEXT: ret
@@ -1704,8 +1703,8 @@ define i64 @srli_2_sh3add(ptr %0, i64 %1) {
define signext i16 @srli_2_sh1add(ptr %0, i64 %1) {
; RV64I-LABEL: srli_2_sh1add:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a1, 1
-; RV64I-NEXT: andi a1, a1, -2
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lh a0, 0(a0)
; RV64I-NEXT: ret
@@ -1725,8 +1724,8 @@ define signext i16 @srli_2_sh1add(ptr %0, i64 %1) {
define signext i32 @srli_3_sh2add(ptr %0, i64 %1) {
; RV64I-LABEL: srli_3_sh2add:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a1, 1
-; RV64I-NEXT: andi a1, a1, -4
+; RV64I-NEXT: srli a1, a1, 3
+; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: ret
@@ -1746,8 +1745,8 @@ define signext i32 @srli_3_sh2add(ptr %0, i64 %1) {
define i64 @srli_4_sh3add(ptr %0, i64 %1) {
; RV64I-LABEL: srli_4_sh3add:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a1, 1
-; RV64I-NEXT: andi a1, a1, -8
+; RV64I-NEXT: srli a1, a1, 4
+; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: ld a0, 0(a0)
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
index f38aa71fb158d0..065a6c529323c6 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
@@ -7,8 +7,8 @@
define i64 @test1(i64 %a) nounwind {
; RV64I-LABEL: test1:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 30
; RV64I-NEXT: ret
%1 = shl i64 %a, 32
%2 = ashr i64 %1, 30
@@ -18,7 +18,8 @@ define i64 @test1(i64 %a) nounwind {
define i64 @test2(i32 signext %a) nounwind {
; RV64I-LABEL: test2:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 3
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 29
; RV64I-NEXT: ret
%1 = zext i32 %a to i64
%2 = shl i64 %1, 32
@@ -29,8 +30,9 @@ define i64 @test2(i32 signext %a) nounwind {
define i64 @test3(ptr %a) nounwind {
; RV64I-LABEL: test3:
; RV64I: # %bb.0:
-; RV64I-NEXT: lw a0, 0(a0)
-; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: lwu a0, 0(a0)
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 28
; RV64I-NEXT: ret
%1 = load i32, ptr %a
%2 = zext i32 %1 to i64
@@ -42,8 +44,9 @@ define i64 @test3(ptr %a) nounwind {
define i64 @test4(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: test4:
; RV64I: # %bb.0:
-; RV64I-NEXT: addw a0, a0, a1
-; RV64I-NEXT: slli a0, a0, 30
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 2
; RV64I-NEXT: ret
%1 = add i32 %a, %b
%2 = zext i32 %1 to i64
@@ -56,7 +59,8 @@ define i64 @test5(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: test5:
; RV64I: # %bb.0:
; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: slli a0, a0, 31
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 1
; RV64I-NEXT: ret
%1 = xor i32 %a, %b
%2 = zext i32 %1 to i64
@@ -69,7 +73,8 @@ define i64 @test6(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: test6:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a0, a0, a1
-; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 16
; RV64I-NEXT: ret
%1 = shl i32 %a, %b
%2 = zext i32 %1 to i64
@@ -163,8 +168,8 @@ define i64 @test11(ptr %0, i64 %1) {
define i32 @test12(i32 signext %0) {
; RV64I-LABEL: test12:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 49
-; RV64I-NEXT: srai a0, a0, 47
+; RV64I-NEXT: slli a0, a0, 17
+; RV64I-NEXT: sraiw a0, a0, 15
; RV64I-NEXT: ret
%2 = shl i32 %0, 17
%3 = ashr i32 %2, 15
diff --git a/llvm/test/CodeGen/RISCV/rv64i-tricky-shifts.ll b/llvm/test/CodeGen/RISCV/rv64i-tricky-shifts.ll
index b01833152ef419..cc4978edf10245 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-tricky-shifts.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-tricky-shifts.ll
@@ -11,7 +11,8 @@ define i64 @tricky_shl(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: tricky_shl:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a0, a0, a1
-; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 32
; RV64I-NEXT: ret
%1 = shl i64 %a, %b
%2 = shl i64 %1, 32
@@ -34,7 +35,8 @@ define i64 @tricky_lshr(i64 %a, i64 %b) nounwind {
define i64 @tricky_ashr(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: tricky_ashr:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srai a0, a0, 32
; RV64I-NEXT: sra a0, a0, a1
; RV64I-NEXT: ret
%1 = shl i64 %a, 32
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 6cdab888ffcde7..e0f739807cee52 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -42,7 +42,8 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB0_2:
; RV64I-NEXT: li a0, 32
@@ -94,7 +95,8 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: j .LBB1_3
; RV64I-NEXT: .LBB1_2:
; RV64I-NEXT: li a0, 32
@@ -155,7 +157,8 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a2, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a2, a1, 58
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: ret
@@ -209,7 +212,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a1, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a1, a1, 58
; RV64I-NEXT: xori a1, a1, 31
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: addi a0, a0, -1
@@ -270,7 +274,8 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB4_2:
; RV64I-NEXT: li a0, 32
@@ -609,14 +614,14 @@ define signext i32 @sexth_i32(i32 signext %a) nounwind {
define signext i32 @no_sexth_i32(i32 signext %a) nounwind {
; RV64I-LABEL: no_sexth_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 49
-; RV64I-NEXT: srai a0, a0, 48
+; RV64I-NEXT: slli a0, a0, 17
+; RV64I-NEXT: sraiw a0, a0, 16
; RV64I-NEXT: ret
;
; RV64XTHEADBB-LABEL: no_sexth_i32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: slli a0, a0, 49
-; RV64XTHEADBB-NEXT: srai a0, a0, 48
+; RV64XTHEADBB-NEXT: slli a0, a0, 17
+; RV64XTHEADBB-NEXT: sraiw a0, a0, 16
; RV64XTHEADBB-NEXT: ret
%shl = shl i32 %a, 17
%shr = ashr exact i32 %shl, 16
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 817e2b7d0bd993..e5801261b017f3 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -1703,10 +1703,9 @@ define zeroext i32 @sext_ashr_zext_i16(i16 %a) nounwind {
define signext i16 @sh1adduw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: sh1adduw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 33
-; RV64I-NEXT: addi a2, a2, -2
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 31
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: lh a0, 0(a0)
; RV64I-NEXT: ret
@@ -1727,10 +1726,9 @@ define signext i16 @sh1adduw_ptrdiff(i64 %diff, ptr %baseptr) {
define signext i32 @sh2adduw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: sh2adduw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 34
-; RV64I-NEXT: addi a2, a2, -4
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 30
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: ret
@@ -1751,10 +1749,9 @@ define signext i32 @sh2adduw_ptrdiff(i64 %diff, ptr %baseptr) {
define i64 @sh3adduw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: sh3adduw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 35
-; RV64I-NEXT: addi a2, a2, -8
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 3
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 29
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: ld a0, 0(a0)
; RV64I-NEXT: ret
@@ -1797,10 +1794,9 @@ define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) {
define i128 @slliuw_ptrdiff(i64 %diff, ptr %baseptr) {
; RV64I-LABEL: slliuw_ptrdiff:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a2, 1
-; RV64I-NEXT: slli a2, a2, 36
-; RV64I-NEXT: addi a2, a2, -16
-; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a0, a0, 4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 28
; RV64I-NEXT: add a1, a1, a0
; RV64I-NEXT: ld a0, 0(a1)
; RV64I-NEXT: ld a1, 8(a1)
@@ -2043,8 +2039,8 @@ define i64 @srli_2_sh3add(ptr %0, i64 %1) {
define signext i16 @srli_2_sh1add(ptr %0, i64 %1) {
; RV64I-LABEL: srli_2_sh1add:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a1, 1
-; RV64I-NEXT: andi a1, a1, -2
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lh a0, 0(a0)
; RV64I-NEXT: ret
@@ -2064,8 +2060,8 @@ define signext i16 @srli_2_sh1add(ptr %0, i64 %1) {
define signext i32 @srli_3_sh2add(ptr %0, i64 %1) {
; RV64I-LABEL: srli_3_sh2add:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a1, 1
-; RV64I-NEXT: andi a1, a1, -4
+; RV64I-NEXT: srli a1, a1, 3
+; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: ret
@@ -2085,8 +2081,8 @@ define signext i32 @srli_3_sh2add(ptr %0, i64 %1) {
define i64 @srli_4_sh3add(ptr %0, i64 %1) {
; RV64I-LABEL: srli_4_sh3add:
; RV64I: # %bb.0:
-; RV64I-NEXT: srli a1, a1, 1
-; RV64I-NEXT: andi a1, a1, -8
+; RV64I-NEXT: srli a1, a1, 4
+; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: ld a0, 0(a0)
; RV64I-NEXT: ret
@@ -2106,7 +2102,8 @@ define i64 @srli_4_sh3add(ptr %0, i64 %1) {
define signext i16 @shl_2_sh1add(ptr %0, i32 signext %1) {
; RV64I-LABEL: shl_2_sh1add:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: slli a1, a1, 2
+; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: srli a1, a1, 31
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lh a0, 0(a0)
@@ -2128,7 +2125,8 @@ define signext i16 @shl_2_sh1add(ptr %0, i32 signext %1) {
define signext i32 @shl_16_sh2add(ptr %0, i32 signext %1) {
; RV64I-LABEL: shl_16_sh2add:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a1, 48
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: srli a1, a1, 30
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lw a0, 0(a0)
@@ -2150,7 +2148,8 @@ define signext i32 @shl_16_sh2add(ptr %0, i32 signext %1) {
define i64 @shl_31_sh3add(ptr %0, i32 signext %1) {
; RV64I-LABEL: shl_31_sh3add:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a1, 63
+; RV64I-NEXT: slli a1, a1, 31
+; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: srli a1, a1, 29
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: ld a0, 0(a0)
@@ -2611,7 +2610,7 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) {
; RV64ZBA-LABEL: test_gep_gep_dont_crash:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: srliw a2, a2, 6
-; RV64ZBA-NEXT: add a1, a2, a1
+; RV64ZBA-NEXT: sh3add a0, a2, a0
; RV64ZBA-NEXT: sh3add a0, a1, a0
; RV64ZBA-NEXT: ret
%lshr = lshr i64 %a2, 6
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 4d5ef5db86057b..3f0b25ecdcf473 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -42,7 +42,8 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB0_2:
; RV64I-NEXT: li a0, 32
@@ -92,7 +93,8 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: j .LBB1_3
; RV64I-NEXT: .LBB1_2:
; RV64I-NEXT: li a0, 32
@@ -151,7 +153,8 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a2, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a2, a1, 58
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: ret
@@ -203,7 +206,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a1, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a1, a1, 58
; RV64I-NEXT: xori a1, a1, 31
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: addi a0, a0, -1
@@ -262,7 +266,8 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB4_2:
; RV64I-NEXT: li a0, 32
@@ -539,7 +544,8 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_i32:
@@ -650,7 +656,8 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: slli a1, a0, 16
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_i32_load:
@@ -688,7 +695,8 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
; RV64I-NEXT: add a0, a0, a5
; RV64I-NEXT: slli a5, a0, 16
; RV64I-NEXT: add a0, a0, a5
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: srli a0, a0, 58
; RV64I-NEXT: srli a5, a1, 1
; RV64I-NEXT: and a3, a5, a3
; RV64I-NEXT: sub a1, a1, a3
@@ -703,7 +711,8 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a1, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a1, a1, 58
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_v2i32:
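The srliw rewrites in rv64zbb.ll trade an extract of bits [31:24] for one of bits [29:24]: after the byte-sum steps the top byte holds a count of at most 32, so bits [31:30] are known zero and a 6-bit extract suffices — the kind of known-bits fact the newly-set shift flags surface, though here it costs one extra instruction. A sketch of the equivalence under that assumption:

  M64 = (1 << 64) - 1

  def srliw_24(w):             # srliw a0, a0, 24
      return (w & 0xFFFFFFFF) >> 24

  def slli_srli(w):            # slli a0, a0, 34 ; srli a0, a0, 58
      return ((w << 34) & M64) >> 58

  import random
  for _ in range(100000):
      total = random.randrange(33)              # popcount of an i32 is at most 32
      w = random.getrandbits(24) | (total << 24)
      assert srliw_24(w) == slli_srli(w) == total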
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index e15e6452163b1c..398193f4810595 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -311,10 +311,11 @@ define <vscale x 1 x i8> @extract_nxv8i8_nxv1i8_7(<vscale x 8 x i8> %vec) {
; CHECK-LABEL: extract_nxv8i8_nxv1i8_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a1, a0, 3
-; CHECK-NEXT: sub a0, a0, a1
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a1, a1, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a1
; CHECK-NEXT: ret
%c = call <vscale x 1 x i8> @llvm.vector.extract.nxv1i8.nxv8i8(<vscale x 8 x i8> %vec, i64 7)
ret <vscale x 1 x i8> %c
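In extract-subvector.ll, the old `vlenb - (vlenb >> 3)` and the new `((vlenb >> 3) << 3) - (vlenb >> 3)` both evaluate to 7 * (vlenb / 8); they coincide exactly when vlenb is a multiple of 8, as it is for VLEN >= 64. A minimal check over plausible VLEN/8 values (an assumption, not from the patch):

  for vlenb in (16, 32, 64, 128, 256):
      eighth = vlenb >> 3
      old = vlenb - eighth                 # srli ; sub
      new = (eighth << 3) - eighth         # srli ; slli ; sub
      assert old == new == 7 * eighth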
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index df9949e617b807..81acc359a265d5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -10,6 +10,8 @@ define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 0
ret i8 %r
@@ -21,6 +23,8 @@ define signext i8 @extractelt_nxv1i8_imm(<vscale x 1 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 2
ret i8 %r
@@ -32,6 +36,8 @@ define signext i8 @extractelt_nxv1i8_idx(<vscale x 1 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 %idx
ret i8 %r
@@ -42,6 +48,8 @@ define signext i8 @extractelt_nxv2i8_0(<vscale x 2 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 0
ret i8 %r
@@ -53,6 +61,8 @@ define signext i8 @extractelt_nxv2i8_imm(<vscale x 2 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 2
ret i8 %r
@@ -64,6 +74,8 @@ define signext i8 @extractelt_nxv2i8_idx(<vscale x 2 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 %idx
ret i8 %r
@@ -74,6 +86,8 @@ define signext i8 @extractelt_nxv4i8_0(<vscale x 4 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 0
ret i8 %r
@@ -85,6 +99,8 @@ define signext i8 @extractelt_nxv4i8_imm(<vscale x 4 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 2
ret i8 %r
@@ -96,6 +112,8 @@ define signext i8 @extractelt_nxv4i8_idx(<vscale x 4 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 %idx
ret i8 %r
@@ -106,6 +124,8 @@ define signext i8 @extractelt_nxv8i8_0(<vscale x 8 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i8> %v, i32 0
ret i8 %r
@@ -117,6 +137,8 @@ define signext i8 @extractelt_nxv8i8_imm(<vscale x 8 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i8> %v, i32 2
ret i8 %r
@@ -128,6 +150,8 @@ define signext i8 @extractelt_nxv8i8_idx(<vscale x 8 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i8> %v, i32 %idx
ret i8 %r
@@ -138,6 +162,8 @@ define signext i8 @extractelt_nxv16i8_0(<vscale x 16 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i8> %v, i32 0
ret i8 %r
@@ -149,6 +175,8 @@ define signext i8 @extractelt_nxv16i8_imm(<vscale x 16 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i8> %v, i32 2
ret i8 %r
@@ -160,6 +188,8 @@ define signext i8 @extractelt_nxv16i8_idx(<vscale x 16 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i8> %v, i32 %idx
ret i8 %r
@@ -170,6 +200,8 @@ define signext i8 @extractelt_nxv32i8_0(<vscale x 32 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i8> %v, i32 0
ret i8 %r
@@ -181,6 +213,8 @@ define signext i8 @extractelt_nxv32i8_imm(<vscale x 32 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i8> %v, i32 2
ret i8 %r
@@ -192,6 +226,8 @@ define signext i8 @extractelt_nxv32i8_idx(<vscale x 32 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i8> %v, i32 %idx
ret i8 %r
@@ -202,6 +238,8 @@ define signext i8 @extractelt_nxv64i8_0(<vscale x 64 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 64 x i8> %v, i32 0
ret i8 %r
@@ -213,6 +251,8 @@ define signext i8 @extractelt_nxv64i8_imm(<vscale x 64 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 64 x i8> %v, i32 2
ret i8 %r
@@ -224,6 +264,8 @@ define signext i8 @extractelt_nxv64i8_idx(<vscale x 64 x i8> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: srai a0, a0, 24
; CHECK-NEXT: ret
%r = extractelement <vscale x 64 x i8> %v, i32 %idx
ret i8 %r
@@ -234,6 +276,8 @@ define signext i16 @extractelt_nxv1i16_0(<vscale x 1 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 0
ret i16 %r
@@ -245,6 +289,8 @@ define signext i16 @extractelt_nxv1i16_imm(<vscale x 1 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 2
ret i16 %r
@@ -256,6 +302,8 @@ define signext i16 @extractelt_nxv1i16_idx(<vscale x 1 x i16> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 %idx
ret i16 %r
@@ -266,6 +314,8 @@ define signext i16 @extractelt_nxv2i16_0(<vscale x 2 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 0
ret i16 %r
@@ -277,6 +327,8 @@ define signext i16 @extractelt_nxv2i16_imm(<vscale x 2 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 2
ret i16 %r
@@ -288,6 +340,8 @@ define signext i16 @extractelt_nxv2i16_idx(<vscale x 2 x i16> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 %idx
ret i16 %r
@@ -298,6 +352,8 @@ define signext i16 @extractelt_nxv4i16_0(<vscale x 4 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i16> %v, i32 0
ret i16 %r
@@ -309,6 +365,8 @@ define signext i16 @extractelt_nxv4i16_imm(<vscale x 4 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i16> %v, i32 2
ret i16 %r
@@ -320,6 +378,8 @@ define signext i16 @extractelt_nxv4i16_idx(<vscale x 4 x i16> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i16> %v, i32 %idx
ret i16 %r
@@ -330,6 +390,8 @@ define signext i16 @extractelt_nxv8i16_0(<vscale x 8 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i16> %v, i32 0
ret i16 %r
@@ -341,6 +403,8 @@ define signext i16 @extractelt_nxv8i16_imm(<vscale x 8 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i16> %v, i32 2
ret i16 %r
@@ -352,6 +416,8 @@ define signext i16 @extractelt_nxv8i16_idx(<vscale x 8 x i16> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i16> %v, i32 %idx
ret i16 %r
@@ -362,6 +428,8 @@ define signext i16 @extractelt_nxv16i16_0(<vscale x 16 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i16> %v, i32 0
ret i16 %r
@@ -373,6 +441,8 @@ define signext i16 @extractelt_nxv16i16_imm(<vscale x 16 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i16> %v, i32 2
ret i16 %r
@@ -384,6 +454,8 @@ define signext i16 @extractelt_nxv16i16_idx(<vscale x 16 x i16> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i16> %v, i32 %idx
ret i16 %r
@@ -394,6 +466,8 @@ define signext i16 @extractelt_nxv32i16_0(<vscale x 32 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i16> %v, i32 0
ret i16 %r
@@ -405,6 +479,8 @@ define signext i16 @extractelt_nxv32i16_imm(<vscale x 32 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i16> %v, i32 2
ret i16 %r
@@ -416,6 +492,8 @@ define signext i16 @extractelt_nxv32i16_idx(<vscale x 32 x i16> %v, i32 %idx) {
; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srai a0, a0, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i16> %v, i32 %idx
ret i16 %r
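The slli/srai pairs added throughout extractelt-int-rv32.ll are the standard shift-based sign extension of the low byte (24 = XLEN - 8; halfwords use 16, and the rv64 file below uses 56 and 48). A small model of the idiom:

  def srai32(v, n):
      # Arithmetic right shift of a 32-bit two's-complement value.
      v &= 0xFFFFFFFF
      if v & 0x80000000:
          v -= 1 << 32
      return (v >> n) & 0xFFFFFFFF

  def sext_low_byte(x):        # slli a0, a0, 24 ; srai a0, a0, 24
      return srai32((x << 24) & 0xFFFFFFFF, 24)

  assert sext_low_byte(0x7F) == 0x0000007F
  assert sext_low_byte(0x80) == 0xFFFFFF80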
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index a96cf5807e6c16..f8a1a9448c453f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -9,6 +9,8 @@ define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 0
ret i8 %r
@@ -20,6 +22,8 @@ define signext i8 @extractelt_nxv1i8_imm(<vscale x 1 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 2
ret i8 %r
@@ -31,6 +35,8 @@ define signext i8 @extractelt_nxv1i8_idx(<vscale x 1 x i8> %v, i32 zeroext %idx)
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 %idx
ret i8 %r
@@ -41,6 +47,8 @@ define signext i8 @extractelt_nxv2i8_0(<vscale x 2 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 0
ret i8 %r
@@ -52,6 +60,8 @@ define signext i8 @extractelt_nxv2i8_imm(<vscale x 2 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 2
ret i8 %r
@@ -63,6 +73,8 @@ define signext i8 @extractelt_nxv2i8_idx(<vscale x 2 x i8> %v, i32 zeroext %idx)
; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 %idx
ret i8 %r
@@ -73,6 +85,8 @@ define signext i8 @extractelt_nxv4i8_0(<vscale x 4 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 0
ret i8 %r
@@ -84,6 +98,8 @@ define signext i8 @extractelt_nxv4i8_imm(<vscale x 4 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 2
ret i8 %r
@@ -95,6 +111,8 @@ define signext i8 @extractelt_nxv4i8_idx(<vscale x 4 x i8> %v, i32 zeroext %idx)
; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 %idx
ret i8 %r
@@ -105,6 +123,8 @@ define signext i8 @extractelt_nxv8i8_0(<vscale x 8 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i8> %v, i32 0
ret i8 %r
@@ -116,6 +136,8 @@ define signext i8 @extractelt_nxv8i8_imm(<vscale x 8 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i8> %v, i32 2
ret i8 %r
@@ -127,6 +149,8 @@ define signext i8 @extractelt_nxv8i8_idx(<vscale x 8 x i8> %v, i32 zeroext %idx)
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i8> %v, i32 %idx
ret i8 %r
@@ -137,6 +161,8 @@ define signext i8 @extractelt_nxv16i8_0(<vscale x 16 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i8> %v, i32 0
ret i8 %r
@@ -148,6 +174,8 @@ define signext i8 @extractelt_nxv16i8_imm(<vscale x 16 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i8> %v, i32 2
ret i8 %r
@@ -159,6 +187,8 @@ define signext i8 @extractelt_nxv16i8_idx(<vscale x 16 x i8> %v, i32 zeroext %id
; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i8> %v, i32 %idx
ret i8 %r
@@ -169,6 +199,8 @@ define signext i8 @extractelt_nxv32i8_0(<vscale x 32 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i8> %v, i32 0
ret i8 %r
@@ -180,6 +212,8 @@ define signext i8 @extractelt_nxv32i8_imm(<vscale x 32 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i8> %v, i32 2
ret i8 %r
@@ -191,6 +225,8 @@ define signext i8 @extractelt_nxv32i8_idx(<vscale x 32 x i8> %v, i32 zeroext %id
; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i8> %v, i32 %idx
ret i8 %r
@@ -201,6 +237,8 @@ define signext i8 @extractelt_nxv64i8_0(<vscale x 64 x i8> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 64 x i8> %v, i32 0
ret i8 %r
@@ -212,6 +250,8 @@ define signext i8 @extractelt_nxv64i8_imm(<vscale x 64 x i8> %v) {
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 64 x i8> %v, i32 2
ret i8 %r
@@ -223,6 +263,8 @@ define signext i8 @extractelt_nxv64i8_idx(<vscale x 64 x i8> %v, i32 zeroext %id
; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: ret
%r = extractelement <vscale x 64 x i8> %v, i32 %idx
ret i8 %r
@@ -233,6 +275,8 @@ define signext i16 @extractelt_nxv1i16_0(<vscale x 1 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 0
ret i16 %r
@@ -244,6 +288,8 @@ define signext i16 @extractelt_nxv1i16_imm(<vscale x 1 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 2
ret i16 %r
@@ -255,6 +301,8 @@ define signext i16 @extractelt_nxv1i16_idx(<vscale x 1 x i16> %v, i32 zeroext %i
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 %idx
ret i16 %r
@@ -265,6 +313,8 @@ define signext i16 @extractelt_nxv2i16_0(<vscale x 2 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 0
ret i16 %r
@@ -276,6 +326,8 @@ define signext i16 @extractelt_nxv2i16_imm(<vscale x 2 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 2
ret i16 %r
@@ -287,6 +339,8 @@ define signext i16 @extractelt_nxv2i16_idx(<vscale x 2 x i16> %v, i32 zeroext %i
; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 %idx
ret i16 %r
@@ -297,6 +351,8 @@ define signext i16 @extractelt_nxv4i16_0(<vscale x 4 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i16> %v, i32 0
ret i16 %r
@@ -308,6 +364,8 @@ define signext i16 @extractelt_nxv4i16_imm(<vscale x 4 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i16> %v, i32 2
ret i16 %r
@@ -319,6 +377,8 @@ define signext i16 @extractelt_nxv4i16_idx(<vscale x 4 x i16> %v, i32 zeroext %i
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i16> %v, i32 %idx
ret i16 %r
@@ -329,6 +389,8 @@ define signext i16 @extractelt_nxv8i16_0(<vscale x 8 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i16> %v, i32 0
ret i16 %r
@@ -340,6 +402,8 @@ define signext i16 @extractelt_nxv8i16_imm(<vscale x 8 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i16> %v, i32 2
ret i16 %r
@@ -351,6 +415,8 @@ define signext i16 @extractelt_nxv8i16_idx(<vscale x 8 x i16> %v, i32 zeroext %i
; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i16> %v, i32 %idx
ret i16 %r
@@ -361,6 +427,8 @@ define signext i16 @extractelt_nxv16i16_0(<vscale x 16 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i16> %v, i32 0
ret i16 %r
@@ -372,6 +440,8 @@ define signext i16 @extractelt_nxv16i16_imm(<vscale x 16 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i16> %v, i32 2
ret i16 %r
@@ -383,6 +453,8 @@ define signext i16 @extractelt_nxv16i16_idx(<vscale x 16 x i16> %v, i32 zeroext
; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i16> %v, i32 %idx
ret i16 %r
@@ -393,6 +465,8 @@ define signext i16 @extractelt_nxv32i16_0(<vscale x 32 x i16> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i16> %v, i32 0
ret i16 %r
@@ -404,6 +478,8 @@ define signext i16 @extractelt_nxv32i16_imm(<vscale x 32 x i16> %v) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i16> %v, i32 2
ret i16 %r
@@ -415,6 +491,8 @@ define signext i16 @extractelt_nxv32i16_idx(<vscale x 32 x i16> %v, i32 zeroext
; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i16> %v, i32 %idx
ret i16 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 539a8403c93521..fdfe250c632ff2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -332,13 +332,17 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t
; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
-; RV32ZVE32F-NEXT: srai a2, a1, 31
+; RV32ZVE32F-NEXT: slli a2, a1, 24
+; RV32ZVE32F-NEXT: srai a2, a2, 24
; RV32ZVE32F-NEXT: vmv.x.s a3, v9
-; RV32ZVE32F-NEXT: srai a4, a3, 31
-; RV32ZVE32F-NEXT: sw a3, 0(a0)
-; RV32ZVE32F-NEXT: sw a1, 8(a0)
-; RV32ZVE32F-NEXT: sw a4, 4(a0)
-; RV32ZVE32F-NEXT: sw a2, 12(a0)
+; RV32ZVE32F-NEXT: slli a4, a3, 24
+; RV32ZVE32F-NEXT: srai a4, a4, 24
+; RV32ZVE32F-NEXT: srai a1, a1, 31
+; RV32ZVE32F-NEXT: srai a3, a3, 31
+; RV32ZVE32F-NEXT: sw a3, 4(a0)
+; RV32ZVE32F-NEXT: sw a1, 12(a0)
+; RV32ZVE32F-NEXT: sw a4, 0(a0)
+; RV32ZVE32F-NEXT: sw a2, 8(a0)
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64:
@@ -362,8 +366,12 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: .LBB6_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: slli a0, a0, 56
+; RV64ZVE32F-NEXT: srai a0, a0, 56
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
+; RV64ZVE32F-NEXT: slli a1, a1, 56
+; RV64ZVE32F-NEXT: srai a1, a1, 56
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = sext <2 x i8> %v to <2 x i64>
@@ -1017,13 +1025,17 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t
; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
-; RV32ZVE32F-NEXT: srai a2, a1, 31
+; RV32ZVE32F-NEXT: slli a2, a1, 16
+; RV32ZVE32F-NEXT: srai a2, a2, 16
; RV32ZVE32F-NEXT: vmv.x.s a3, v9
-; RV32ZVE32F-NEXT: srai a4, a3, 31
-; RV32ZVE32F-NEXT: sw a3, 0(a0)
-; RV32ZVE32F-NEXT: sw a1, 8(a0)
-; RV32ZVE32F-NEXT: sw a4, 4(a0)
-; RV32ZVE32F-NEXT: sw a2, 12(a0)
+; RV32ZVE32F-NEXT: slli a4, a3, 16
+; RV32ZVE32F-NEXT: srai a4, a4, 16
+; RV32ZVE32F-NEXT: srai a1, a1, 31
+; RV32ZVE32F-NEXT: srai a3, a3, 31
+; RV32ZVE32F-NEXT: sw a3, 4(a0)
+; RV32ZVE32F-NEXT: sw a1, 12(a0)
+; RV32ZVE32F-NEXT: sw a4, 0(a0)
+; RV32ZVE32F-NEXT: sw a2, 8(a0)
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64:
@@ -1047,8 +1059,12 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: .LBB17_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: slli a0, a0, 48
+; RV64ZVE32F-NEXT: srai a0, a0, 48
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
+; RV64ZVE32F-NEXT: slli a1, a1, 48
+; RV64ZVE32F-NEXT: srai a1, a1, 48
; RV64ZVE32F-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
%ev = sext <2 x i16> %v to <2 x i64>
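In the RV32ZVE32F gather blocks above, the sext i8 -> i64 (and i16 -> i64) result is now assembled per 32-bit half: the shift pair produces the low word and an arithmetic shift by 31 the high word, with the four sw's reordered to match. A hedged model of that split:

  def sext_i8_to_i64_words(byte):
      # lo = sign-extended byte; hi = all-ones iff negative (the srai-by-31).
      lo = byte - 0x100 if byte & 0x80 else byte
      hi = -1 if lo < 0 else 0
      return lo & 0xFFFFFFFF, hi & 0xFFFFFFFF

  assert sext_i8_to_i64_words(0x80) == (0xFFFFFF80, 0xFFFFFFFF)
  assert sext_i8_to_i64_words(0x7F) == (0x0000007F, 0x00000000)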
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 02a989a9699606..e423e473cf5527 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -7,14 +7,27 @@
declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_add_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.add.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
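From here down, the fixed-vectors-reduction-int-vp.ll CHECK blocks split into RV32/RV64 variants only because the trailing sign-extension shift depends on XLEN; the vector sequence itself is unchanged. The shift amount is simply XLEN minus the element width, e.g.:

  def sext_shift_amount(xlen, elt_bits):
      # slli/srai shift used to sign-extend an elt_bits value in an XLEN register.
      return xlen - elt_bits

  assert sext_shift_amount(32, 8) == 24 and sext_shift_amount(64, 8) == 56
  assert sext_shift_amount(32, 16) == 16 and sext_shift_amount(64, 16) == 48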
@@ -22,15 +35,29 @@ define signext i8 @vpreduce_add_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
declare i8 @llvm.vp.reduce.umax.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_umax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umax_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umax_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -38,14 +65,27 @@ define signext i8 @vpreduce_umax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i
declare i8 @llvm.vp.reduce.smax.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_smax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -53,15 +93,29 @@ define signext i8 @vpreduce_smax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i
declare i8 @llvm.vp.reduce.umin.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_umin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umin_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umin_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -69,14 +123,27 @@ define signext i8 @vpreduce_umin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i
declare i8 @llvm.vp.reduce.smin.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_smin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -84,14 +151,27 @@ define signext i8 @vpreduce_smin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i
declare i8 @llvm.vp.reduce.and.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_and_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.and.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -99,14 +179,27 @@ define signext i8 @vpreduce_and_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
declare i8 @llvm.vp.reduce.or.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_or_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.or.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -114,14 +207,27 @@ define signext i8 @vpreduce_or_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32
declare i8 @llvm.vp.reduce.xor.v2i8(i8, <2 x i8>, <2 x i1>, i32)
define signext i8 @vpreduce_xor_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.xor.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -129,15 +235,29 @@ define signext i8 @vpreduce_xor_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
declare i8 @llvm.vp.reduce.umin.v3i8(i8, <3 x i8>, <3 x i1>, i32)
define signext i8 @vpreduce_umin_v3i8(i8 signext %s, <3 x i8> %v, <3 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umin_v3i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umin_v3i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v3i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umin.v3i8(i8 %s, <3 x i8> %v, <3 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -145,14 +265,27 @@ define signext i8 @vpreduce_umin_v3i8(i8 signext %s, <3 x i8> %v, <3 x i1> %m, i
declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_add_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.add.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -160,15 +293,29 @@ define signext i8 @vpreduce_add_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
declare i8 @llvm.vp.reduce.umax.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_umax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umax_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umax_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -176,14 +323,27 @@ define signext i8 @vpreduce_umax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i
declare i8 @llvm.vp.reduce.smax.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_smax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -191,15 +351,29 @@ define signext i8 @vpreduce_smax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i
declare i8 @llvm.vp.reduce.umin.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_umin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umin_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umin_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -207,14 +381,27 @@ define signext i8 @vpreduce_umin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i
declare i8 @llvm.vp.reduce.smin.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_smin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -222,14 +409,27 @@ define signext i8 @vpreduce_smin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i
declare i8 @llvm.vp.reduce.and.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_and_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.and.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -237,14 +437,27 @@ define signext i8 @vpreduce_and_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
declare i8 @llvm.vp.reduce.or.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_or_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.or.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -252,14 +465,27 @@ define signext i8 @vpreduce_or_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32
declare i8 @llvm.vp.reduce.xor.v4i8(i8, <4 x i8>, <4 x i1>, i32)
define signext i8 @vpreduce_xor_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.xor.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -267,14 +493,27 @@ define signext i8 @vpreduce_xor_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
declare i16 @llvm.vp.reduce.add.v2i16(i16, <2 x i16>, <2 x i1>, i32)
define signext i16 @vpreduce_add_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.add.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -291,6 +530,8 @@ define signext i16 @vpreduce_umax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umax_v2i16:
@@ -302,6 +543,8 @@ define signext i16 @vpreduce_umax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
@@ -310,14 +553,27 @@ define signext i16 @vpreduce_umax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
declare i16 @llvm.vp.reduce.smax.v2i16(i16, <2 x i16>, <2 x i1>, i32)
define signext i16 @vpreduce_smax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -334,6 +590,8 @@ define signext i16 @vpreduce_umin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umin_v2i16:
@@ -345,6 +603,8 @@ define signext i16 @vpreduce_umin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
@@ -353,14 +613,27 @@ define signext i16 @vpreduce_umin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
declare i16 @llvm.vp.reduce.smin.v2i16(i16, <2 x i16>, <2 x i1>, i32)
define signext i16 @vpreduce_smin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -368,14 +641,27 @@ define signext i16 @vpreduce_smin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %
declare i16 @llvm.vp.reduce.and.v2i16(i16, <2 x i16>, <2 x i1>, i32)
define signext i16 @vpreduce_and_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.and.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -383,14 +669,27 @@ define signext i16 @vpreduce_and_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m
declare i16 @llvm.vp.reduce.or.v2i16(i16, <2 x i16>, <2 x i1>, i32)
define signext i16 @vpreduce_or_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.or.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -398,14 +697,27 @@ define signext i16 @vpreduce_or_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m,
declare i16 @llvm.vp.reduce.xor.v2i16(i16, <2 x i16>, <2 x i1>, i32)
define signext i16 @vpreduce_xor_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.xor.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -413,14 +725,27 @@ define signext i16 @vpreduce_xor_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m
declare i16 @llvm.vp.reduce.add.v4i16(i16, <4 x i16>, <4 x i1>, i32)
define signext i16 @vpreduce_add_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.add.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -437,6 +762,8 @@ define signext i16 @vpreduce_umax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umax_v4i16:
@@ -448,6 +775,8 @@ define signext i16 @vpreduce_umax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
@@ -456,14 +785,27 @@ define signext i16 @vpreduce_umax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
declare i16 @llvm.vp.reduce.smax.v4i16(i16, <4 x i16>, <4 x i1>, i32)
define signext i16 @vpreduce_smax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -480,6 +822,8 @@ define signext i16 @vpreduce_umin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umin_v4i16:
@@ -491,6 +835,8 @@ define signext i16 @vpreduce_umin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
@@ -499,14 +845,27 @@ define signext i16 @vpreduce_umin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
declare i16 @llvm.vp.reduce.smin.v4i16(i16, <4 x i16>, <4 x i1>, i32)
define signext i16 @vpreduce_smin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -514,14 +873,27 @@ define signext i16 @vpreduce_smin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %
declare i16 @llvm.vp.reduce.and.v4i16(i16, <4 x i16>, <4 x i1>, i32)
define signext i16 @vpreduce_and_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.and.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -529,14 +901,27 @@ define signext i16 @vpreduce_and_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m
declare i16 @llvm.vp.reduce.or.v4i16(i16, <4 x i16>, <4 x i1>, i32)
define signext i16 @vpreduce_or_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.or.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -544,14 +929,27 @@ define signext i16 @vpreduce_or_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m,
declare i16 @llvm.vp.reduce.xor.v4i16(i16, <4 x i16>, <4 x i1>, i32)
define signext i16 @vpreduce_xor_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.xor.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
ret i16 %r
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
index a4ab67f41595d4..cc105ba0a368c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
@@ -3,19 +3,6 @@
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
-; CHECK-LABEL: sad_4x8_as_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vminu.vv v10, v8, v9
-; CHECK-NEXT: vmaxu.vv v8, v8, v9
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vwredsumu.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
entry:
%1 = zext <4 x i8> %a to <4 x i16>
%3 = zext <4 x i8> %b to <4 x i16>
@@ -48,19 +35,6 @@ entry:
}
define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: sad_16x8_as_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vminu.vv v10, v8, v9
-; CHECK-NEXT: vmaxu.vv v8, v8, v9
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vwredsumu.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
entry:
%1 = zext <16 x i8> %a to <16 x i16>
%3 = zext <16 x i8> %b to <16 x i16>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
index b747d73ce353e2..67db91df410ef6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
@@ -84,25 +84,63 @@ define void @store_v6f64(ptr %p, <6 x double> %v) {
}
define void @store_v6i1(ptr %p, <6 x i1> %v) {
-; CHECK-LABEL: store_v6i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vfirst.m a1, v0
-; CHECK-NEXT: seqz a1, a1
-; CHECK-NEXT: vmv.x.s a2, v0
-; CHECK-NEXT: andi a3, a2, 2
-; CHECK-NEXT: or a1, a1, a3
-; CHECK-NEXT: andi a3, a2, 4
-; CHECK-NEXT: andi a4, a2, 8
-; CHECK-NEXT: or a3, a3, a4
-; CHECK-NEXT: or a1, a1, a3
-; CHECK-NEXT: andi a3, a2, 16
-; CHECK-NEXT: andi a2, a2, -32
-; CHECK-NEXT: or a2, a3, a2
-; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: andi a1, a1, 63
-; CHECK-NEXT: sb a1, 0(a0)
-; CHECK-NEXT: ret
+; RV32-LABEL: store_v6i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vfirst.m a1, v0
+; RV32-NEXT: seqz a1, a1
+; RV32-NEXT: vmv.x.s a2, v0
+; RV32-NEXT: slli a3, a2, 30
+; RV32-NEXT: srli a3, a3, 31
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: slli a3, a2, 29
+; RV32-NEXT: srli a3, a3, 31
+; RV32-NEXT: slli a3, a3, 2
+; RV32-NEXT: slli a4, a2, 28
+; RV32-NEXT: srli a4, a4, 31
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: slli a3, a2, 27
+; RV32-NEXT: srli a3, a3, 31
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: srli a2, a2, 5
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: or a1, a1, a2
+; RV32-NEXT: andi a1, a1, 63
+; RV32-NEXT: sb a1, 0(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_v6i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vfirst.m a1, v0
+; RV64-NEXT: seqz a1, a1
+; RV64-NEXT: vmv.x.s a2, v0
+; RV64-NEXT: slli a3, a2, 62
+; RV64-NEXT: srli a3, a3, 63
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: or a1, a1, a3
+; RV64-NEXT: slli a3, a2, 61
+; RV64-NEXT: srli a3, a3, 63
+; RV64-NEXT: slli a3, a3, 2
+; RV64-NEXT: slli a4, a2, 60
+; RV64-NEXT: srli a4, a4, 63
+; RV64-NEXT: slli a4, a4, 3
+; RV64-NEXT: or a3, a3, a4
+; RV64-NEXT: or a1, a1, a3
+; RV64-NEXT: slli a3, a2, 59
+; RV64-NEXT: srli a3, a3, 63
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: srli a2, a2, 5
+; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: or a1, a1, a2
+; RV64-NEXT: andi a1, a1, 63
+; RV64-NEXT: sb a1, 0(a0)
+; RV64-NEXT: ret
store <6 x i1> %v, ptr %p
ret void
}
@@ -294,6 +332,3 @@ define void @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV32: {{.*}}
-; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 36c36a13964c92..5fc171ba824e78 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -498,13 +498,13 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
; RV32-SLOW-NEXT: # %bb.1: # %cond.load
; RV32-SLOW-NEXT: lbu a3, 1(a0)
; RV32-SLOW-NEXT: lbu a4, 0(a0)
-; RV32-SLOW-NEXT: lbu a5, 2(a0)
-; RV32-SLOW-NEXT: lbu a6, 3(a0)
+; RV32-SLOW-NEXT: lbu a5, 3(a0)
+; RV32-SLOW-NEXT: lbu a6, 2(a0)
; RV32-SLOW-NEXT: slli a3, a3, 8
; RV32-SLOW-NEXT: or a3, a3, a4
-; RV32-SLOW-NEXT: slli a5, a5, 16
-; RV32-SLOW-NEXT: slli a6, a6, 24
-; RV32-SLOW-NEXT: or a4, a6, a5
+; RV32-SLOW-NEXT: slli a5, a5, 8
+; RV32-SLOW-NEXT: or a4, a5, a6
+; RV32-SLOW-NEXT: slli a4, a4, 16
; RV32-SLOW-NEXT: or a3, a4, a3
; RV32-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV32-SLOW-NEXT: vmv.v.x v8, a3
@@ -514,13 +514,13 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
; RV32-SLOW-NEXT: # %bb.3: # %cond.load1
; RV32-SLOW-NEXT: lbu a2, 5(a0)
; RV32-SLOW-NEXT: lbu a3, 4(a0)
-; RV32-SLOW-NEXT: lbu a4, 6(a0)
-; RV32-SLOW-NEXT: lbu a0, 7(a0)
+; RV32-SLOW-NEXT: lbu a4, 7(a0)
+; RV32-SLOW-NEXT: lbu a0, 6(a0)
; RV32-SLOW-NEXT: slli a2, a2, 8
; RV32-SLOW-NEXT: or a2, a2, a3
-; RV32-SLOW-NEXT: slli a4, a4, 16
-; RV32-SLOW-NEXT: slli a0, a0, 24
-; RV32-SLOW-NEXT: or a0, a0, a4
+; RV32-SLOW-NEXT: slli a4, a4, 8
+; RV32-SLOW-NEXT: or a0, a4, a0
+; RV32-SLOW-NEXT: slli a0, a0, 16
; RV32-SLOW-NEXT: or a0, a0, a2
; RV32-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV32-SLOW-NEXT: vmv.s.x v9, a0
@@ -542,13 +542,13 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
; RV64-SLOW-NEXT: # %bb.1: # %cond.load
; RV64-SLOW-NEXT: lbu a3, 1(a0)
; RV64-SLOW-NEXT: lbu a4, 0(a0)
-; RV64-SLOW-NEXT: lbu a5, 2(a0)
-; RV64-SLOW-NEXT: lb a6, 3(a0)
+; RV64-SLOW-NEXT: lb a5, 3(a0)
+; RV64-SLOW-NEXT: lbu a6, 2(a0)
; RV64-SLOW-NEXT: slli a3, a3, 8
; RV64-SLOW-NEXT: or a3, a3, a4
-; RV64-SLOW-NEXT: slli a5, a5, 16
-; RV64-SLOW-NEXT: slli a6, a6, 24
-; RV64-SLOW-NEXT: or a4, a6, a5
+; RV64-SLOW-NEXT: slli a5, a5, 8
+; RV64-SLOW-NEXT: or a4, a5, a6
+; RV64-SLOW-NEXT: slli a4, a4, 16
; RV64-SLOW-NEXT: or a3, a4, a3
; RV64-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT: vmv.v.x v8, a3
@@ -558,13 +558,13 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
; RV64-SLOW-NEXT: # %bb.3: # %cond.load1
; RV64-SLOW-NEXT: lbu a2, 5(a0)
; RV64-SLOW-NEXT: lbu a3, 4(a0)
-; RV64-SLOW-NEXT: lbu a4, 6(a0)
-; RV64-SLOW-NEXT: lb a0, 7(a0)
+; RV64-SLOW-NEXT: lb a4, 7(a0)
+; RV64-SLOW-NEXT: lbu a0, 6(a0)
; RV64-SLOW-NEXT: slli a2, a2, 8
; RV64-SLOW-NEXT: or a2, a2, a3
-; RV64-SLOW-NEXT: slli a4, a4, 16
-; RV64-SLOW-NEXT: slli a0, a0, 24
-; RV64-SLOW-NEXT: or a0, a0, a4
+; RV64-SLOW-NEXT: slli a4, a4, 8
+; RV64-SLOW-NEXT: or a0, a4, a0
+; RV64-SLOW-NEXT: slli a0, a0, 16
; RV64-SLOW-NEXT: or a0, a0, a2
; RV64-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT: vmv.s.x v9, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index b15896580d4253..9aed6cc63a93c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -304,10 +304,11 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vsc
; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a1, a0, 3
-; CHECK-NEXT: sub a1, a0, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
ret <vscale x 16 x i8> %v
@@ -317,10 +318,11 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vs
; CHECK-LABEL: insert_nxv16i8_nxv1i8_15:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a1, a0, 3
-; CHECK-NEXT: sub a1, a0, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a1
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v10, a0
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
ret <vscale x 16 x i8> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
index eada90e055df96..8ae088aac016d3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
@@ -36,9 +36,10 @@ define <vscale x 7 x half> @load_nxv7f16(ptr %ptr, ptr %out) {
; CHECK-LABEL: load_nxv7f16:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: srli a3, a2, 3
-; CHECK-NEXT: sub a2, a2, a3
-; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT: srli a2, a2, 3
+; CHECK-NEXT: slli a3, a2, 3
+; CHECK-NEXT: sub a3, a3, a2
+; CHECK-NEXT: vsetvli zero, a3, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 4b4cffc461d462..4ccfa643efa826 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -22,9 +22,10 @@ define void @store_nxv7f64(<vscale x 7 x double> %val, ptr %ptr) {
; CHECK-LABEL: store_nxv7f64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 3
-; CHECK-NEXT: sub a1, a1, a2
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: ret
store <vscale x 7 x double> %val, ptr %ptr
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vscale.i64.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vscale.i64.ll
index d3f3087e06cf5f..decf5ac36db4d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-vscale.i64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vscale.i64.ll
@@ -49,7 +49,8 @@ define i64 @vscale_one() nounwind {
;
; RV64-VLEN256EXACT-LABEL: vscale_one:
; RV64-VLEN256EXACT: # %bb.0: # %entry
-; RV64-VLEN256EXACT-NEXT: li a0, 4
+; RV64-VLEN256EXACT-NEXT: csrr a0, vlenb
+; RV64-VLEN256EXACT-NEXT: srli a0, a0, 3
; RV64-VLEN256EXACT-NEXT: ret
entry:
%0 = call i64 @llvm.vscale.i64()
@@ -73,7 +74,8 @@ define i64 @vscale_uimmpow2xlen() nounwind {
;
; RV64-VLEN256EXACT-LABEL: vscale_uimmpow2xlen:
; RV64-VLEN256EXACT: # %bb.0: # %entry
-; RV64-VLEN256EXACT-NEXT: li a0, 256
+; RV64-VLEN256EXACT-NEXT: csrr a0, vlenb
+; RV64-VLEN256EXACT-NEXT: slli a0, a0, 3
; RV64-VLEN256EXACT-NEXT: ret
entry:
%0 = call i64 @llvm.vscale.i64()
@@ -128,7 +130,8 @@ define i64 @vscale_select(i32 %x, i32 %y) {
;
; RV64-VLEN256EXACT-LABEL: vscale_select:
; RV64-VLEN256EXACT: # %bb.0:
-; RV64-VLEN256EXACT-NEXT: li a0, 4
+; RV64-VLEN256EXACT-NEXT: csrr a0, vlenb
+; RV64-VLEN256EXACT-NEXT: srli a0, a0, 3
; RV64-VLEN256EXACT-NEXT: ret
%a = call i64 @llvm.vscale.i64()
%b = and i64 %a, 4294967295
@@ -153,7 +156,8 @@ define i64 @vscale_high_bits_zero() nounwind {
;
; RV64-VLEN256EXACT-LABEL: vscale_high_bits_zero:
; RV64-VLEN256EXACT: # %bb.0: # %entry
-; RV64-VLEN256EXACT-NEXT: li a0, 4
+; RV64-VLEN256EXACT-NEXT: csrr a0, vlenb
+; RV64-VLEN256EXACT-NEXT: srli a0, a0, 3
; RV64-VLEN256EXACT-NEXT: ret
entry:
%0 = call i64 @llvm.vscale.i64()
@@ -192,7 +196,8 @@ define i64 @vscale_masked() nounwind {
;
; RV64-VLEN256EXACT-LABEL: vscale_masked:
; RV64-VLEN256EXACT: # %bb.0: # %entry
-; RV64-VLEN256EXACT-NEXT: li a0, 4
+; RV64-VLEN256EXACT-NEXT: csrr a0, vlenb
+; RV64-VLEN256EXACT-NEXT: srli a0, a0, 3
; RV64-VLEN256EXACT-NEXT: ret
entry:
%0 = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index 122ac13cb25731..0bd250f952eb05 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -14,21 +14,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV32-NEXT: lw a0, 0(a0)
; RV32-NEXT: srli a2, a0, 16
; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: srli a4, a3, 24
-; RV32-NEXT: srai a3, a3, 24
-; RV32-NEXT: slli a5, a0, 24
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: slli a4, a0, 24
+; RV32-NEXT: srai a4, a4, 24
+; RV32-NEXT: slli a5, a3, 24
; RV32-NEXT: srai a5, a5, 24
-; RV32-NEXT: slli a6, a0, 8
+; RV32-NEXT: slli a6, a2, 24
; RV32-NEXT: srai a6, a6, 24
; RV32-NEXT: sgtz a6, a6
; RV32-NEXT: sgtz a5, a5
-; RV32-NEXT: sgtz a3, a3
-; RV32-NEXT: neg a3, a3
-; RV32-NEXT: and a3, a3, a4
-; RV32-NEXT: slli a3, a3, 8
-; RV32-NEXT: neg a4, a5
+; RV32-NEXT: sgtz a4, a4
+; RV32-NEXT: neg a4, a4
; RV32-NEXT: and a0, a4, a0
; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: neg a4, a5
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: slli a3, a3, 8
; RV32-NEXT: or a0, a0, a3
; RV32-NEXT: neg a3, a6
; RV32-NEXT: and a2, a3, a2
@@ -41,21 +42,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV64-NEXT: lw a0, 0(a0)
; RV64-NEXT: srliw a2, a0, 16
; RV64-NEXT: slli a3, a0, 48
-; RV64-NEXT: srli a4, a3, 56
-; RV64-NEXT: srai a3, a3, 56
-; RV64-NEXT: slli a5, a0, 56
+; RV64-NEXT: srli a3, a3, 56
+; RV64-NEXT: slli a4, a0, 56
+; RV64-NEXT: srai a4, a4, 56
+; RV64-NEXT: slli a5, a3, 56
; RV64-NEXT: srai a5, a5, 56
-; RV64-NEXT: slli a6, a0, 40
+; RV64-NEXT: slli a6, a2, 56
; RV64-NEXT: srai a6, a6, 56
; RV64-NEXT: sgtz a6, a6
; RV64-NEXT: sgtz a5, a5
-; RV64-NEXT: sgtz a3, a3
-; RV64-NEXT: negw a3, a3
-; RV64-NEXT: and a3, a3, a4
-; RV64-NEXT: slli a3, a3, 8
-; RV64-NEXT: negw a4, a5
+; RV64-NEXT: sgtz a4, a4
+; RV64-NEXT: negw a4, a4
; RV64-NEXT: and a0, a4, a0
; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: negw a4, a5
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: slli a3, a3, 8
; RV64-NEXT: or a0, a0, a3
; RV64-NEXT: negw a3, a6
; RV64-NEXT: and a2, a3, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 7bcf37b1af3c8f..89d106d11b23fd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -7,14 +7,27 @@
declare i8 @llvm.vp.reduce.add.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_add_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.add.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -22,15 +35,29 @@ define signext i8 @vpreduce_add_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.umax.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_umax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umax_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umax_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umax.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -38,14 +65,27 @@ define signext i8 @vpreduce_umax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vs
declare i8 @llvm.vp.reduce.smax.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_smax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smax.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -53,15 +93,29 @@ define signext i8 @vpreduce_smax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vs
declare i8 @llvm.vp.reduce.umin.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_umin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umin_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umin_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umin.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -69,14 +123,27 @@ define signext i8 @vpreduce_umin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vs
declare i8 @llvm.vp.reduce.smin.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_smin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smin.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -84,14 +151,27 @@ define signext i8 @vpreduce_smin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vs
declare i8 @llvm.vp.reduce.and.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_and_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.and.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -99,14 +179,27 @@ define signext i8 @vpreduce_and_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.or.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_or_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.or.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -114,14 +207,27 @@ define signext i8 @vpreduce_or_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vsca
declare i8 @llvm.vp.reduce.xor.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
define signext i8 @vpreduce_xor_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.xor.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -129,14 +235,27 @@ define signext i8 @vpreduce_xor_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.add.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_add_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.add.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -144,15 +263,29 @@ define signext i8 @vpreduce_add_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.umax.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_umax_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umax_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umax_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umax.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -160,14 +293,27 @@ define signext i8 @vpreduce_umax_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vs
declare i8 @llvm.vp.reduce.smax.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_smax_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smax.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -175,15 +321,29 @@ define signext i8 @vpreduce_smax_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vs
declare i8 @llvm.vp.reduce.umin.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_umin_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umin_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umin_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umin.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -191,14 +351,27 @@ define signext i8 @vpreduce_umin_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vs
declare i8 @llvm.vp.reduce.smin.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_smin_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smin.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -206,14 +379,27 @@ define signext i8 @vpreduce_smin_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vs
declare i8 @llvm.vp.reduce.and.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_and_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.and.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -221,14 +407,27 @@ define signext i8 @vpreduce_and_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.or.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_or_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.or.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -236,14 +435,27 @@ define signext i8 @vpreduce_or_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vsca
declare i8 @llvm.vp.reduce.xor.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
define signext i8 @vpreduce_xor_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.xor.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -251,14 +463,27 @@ define signext i8 @vpreduce_xor_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.smax.nxv3i8(i8, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
define signext i8 @vpreduce_smax_nxv3i8(i8 signext %s, <vscale x 3 x i8> %v, <vscale x 3 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv3i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv3i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv3i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smax.nxv3i8(i8 %s, <vscale x 3 x i8> %v, <vscale x 3 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -266,14 +491,27 @@ define signext i8 @vpreduce_smax_nxv3i8(i8 signext %s, <vscale x 3 x i8> %v, <vs
declare i8 @llvm.vp.reduce.add.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_add_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.add.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -281,15 +519,29 @@ define signext i8 @vpreduce_add_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.umax.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_umax_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umax_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umax_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umax.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -297,14 +549,27 @@ define signext i8 @vpreduce_umax_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vs
declare i8 @llvm.vp.reduce.smax.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_smax_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smax.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -312,15 +577,29 @@ define signext i8 @vpreduce_smax_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vs
declare i8 @llvm.vp.reduce.umin.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_umin_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_umin_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_umin_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.umin.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -328,14 +607,27 @@ define signext i8 @vpreduce_umin_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vs
declare i8 @llvm.vp.reduce.smin.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_smin_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.smin.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -343,14 +635,27 @@ define signext i8 @vpreduce_smin_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vs
declare i8 @llvm.vp.reduce.and.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_and_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.and.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -358,14 +663,27 @@ define signext i8 @vpreduce_and_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vsc
declare i8 @llvm.vp.reduce.or.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_or_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.or.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -373,14 +691,27 @@ define signext i8 @vpreduce_or_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vsca
declare i8 @llvm.vp.reduce.xor.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
define signext i8 @vpreduce_xor_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%r = call i8 @llvm.vp.reduce.xor.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i8 %r
}
@@ -388,14 +719,27 @@ define signext i8 @vpreduce_xor_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vsc
declare i16 @llvm.vp.reduce.add.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
define signext i16 @vpreduce_add_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.add.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -412,6 +756,8 @@ define signext i16 @vpreduce_umax_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umax_nxv1i16:
@@ -423,6 +769,8 @@ define signext i16 @vpreduce_umax_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umax.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
@@ -431,14 +779,27 @@ define signext i16 @vpreduce_umax_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
declare i16 @llvm.vp.reduce.smax.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
define signext i16 @vpreduce_smax_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smax.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -455,6 +816,8 @@ define signext i16 @vpreduce_umin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umin_nxv1i16:
@@ -466,6 +829,8 @@ define signext i16 @vpreduce_umin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umin.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
@@ -474,14 +839,27 @@ define signext i16 @vpreduce_umin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
declare i16 @llvm.vp.reduce.smin.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
define signext i16 @vpreduce_smin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smin.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -489,14 +867,27 @@ define signext i16 @vpreduce_smin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
declare i16 @llvm.vp.reduce.and.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
define signext i16 @vpreduce_and_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.and.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -504,14 +895,27 @@ define signext i16 @vpreduce_and_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
declare i16 @llvm.vp.reduce.or.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
define signext i16 @vpreduce_or_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.or.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -519,14 +923,27 @@ define signext i16 @vpreduce_or_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <
declare i16 @llvm.vp.reduce.xor.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
define signext i16 @vpreduce_xor_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.xor.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -534,14 +951,27 @@ define signext i16 @vpreduce_xor_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v,
declare i16 @llvm.vp.reduce.add.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
define signext i16 @vpreduce_add_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.add.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -558,6 +988,8 @@ define signext i16 @vpreduce_umax_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umax_nxv2i16:
@@ -569,6 +1001,8 @@ define signext i16 @vpreduce_umax_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umax.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
@@ -577,14 +1011,27 @@ define signext i16 @vpreduce_umax_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
declare i16 @llvm.vp.reduce.smax.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
define signext i16 @vpreduce_smax_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smax.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -601,6 +1048,8 @@ define signext i16 @vpreduce_umin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umin_nxv2i16:
@@ -612,6 +1061,8 @@ define signext i16 @vpreduce_umin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umin.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
@@ -620,14 +1071,27 @@ define signext i16 @vpreduce_umin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
declare i16 @llvm.vp.reduce.smin.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
define signext i16 @vpreduce_smin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smin.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -635,14 +1099,27 @@ define signext i16 @vpreduce_smin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
declare i16 @llvm.vp.reduce.and.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
define signext i16 @vpreduce_and_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.and.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -650,14 +1127,27 @@ define signext i16 @vpreduce_and_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
declare i16 @llvm.vp.reduce.or.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
define signext i16 @vpreduce_or_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.or.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -665,14 +1155,27 @@ define signext i16 @vpreduce_or_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <
declare i16 @llvm.vp.reduce.xor.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
define signext i16 @vpreduce_xor_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.xor.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -680,14 +1183,27 @@ define signext i16 @vpreduce_xor_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v,
declare i16 @llvm.vp.reduce.add.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
define signext i16 @vpreduce_add_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_add_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_add_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.add.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -704,6 +1220,8 @@ define signext i16 @vpreduce_umax_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umax_nxv4i16:
@@ -715,6 +1233,8 @@ define signext i16 @vpreduce_umax_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umax.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
@@ -723,14 +1243,27 @@ define signext i16 @vpreduce_umax_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
declare i16 @llvm.vp.reduce.smax.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
define signext i16 @vpreduce_smax_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smax_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smax_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smax.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -747,6 +1280,8 @@ define signext i16 @vpreduce_umin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vpreduce_umin_nxv4i16:
@@ -758,6 +1293,8 @@ define signext i16 @vpreduce_umin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t
; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.umin.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
@@ -766,14 +1303,27 @@ define signext i16 @vpreduce_umin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
declare i16 @llvm.vp.reduce.smin.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
define signext i16 @vpreduce_smin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_smin_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_smin_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.smin.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -781,14 +1331,27 @@ define signext i16 @vpreduce_smin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
declare i16 @llvm.vp.reduce.and.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
define signext i16 @vpreduce_and_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_and_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_and_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vredand.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.and.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -796,14 +1359,27 @@ define signext i16 @vpreduce_and_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v,
declare i16 @llvm.vp.reduce.or.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
define signext i16 @vpreduce_or_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_or_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_or_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vredor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.or.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
}
@@ -811,14 +1387,27 @@ define signext i16 @vpreduce_or_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <
declare i16 @llvm.vp.reduce.xor.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
define signext i16 @vpreduce_xor_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vpreduce_xor_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: ret
+; RV32-LABEL: vpreduce_xor_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%r = call i16 @llvm.vp.reduce.xor.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
ret i16 %r
}
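
The file below gets the same RV32/RV64 split for the unpredicated llvm.vector.reduce.* intrinsics. Note also the complementary idiom in the unsigned umax/umin tests above: the scalar start value is zero-extended with andi a0, a0, 255 before seeding the reduction. A hedged C equivalent, helper name hypothetical:

#include <stdint.h>

/* andi a0, a0, 255 before vredmaxu/vredminu: zero-extend the i8
   start value so it seeds the unsigned reduction correctly. */
static inline uint32_t zext_i8(uint32_t x) {
    return x & 0xff;
}
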
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
index bcab7d05e698ee..9653f93cac1def 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
@@ -7,13 +7,25 @@
declare i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_add_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_add_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredsum.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_add_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredsum.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -21,12 +33,23 @@ define signext i8 @vreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.umax.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_umax_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_umax_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umax_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umax_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.umax.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -34,12 +57,23 @@ define signext i8 @vreduce_umax_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_smax_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_smax_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smax_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredmax.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smax_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vredmax.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -47,12 +81,23 @@ define signext i8 @vreduce_smax_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.umin.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_umin_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_umin_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umin_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredminu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umin_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vredminu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.umin.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -60,12 +105,23 @@ define signext i8 @vreduce_umin_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.smin.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_smin_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_smin_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smin_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredmin.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smin_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vredmin.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.smin.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -73,12 +129,23 @@ define signext i8 @vreduce_smin_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.and.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_and_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_and_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vredand.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.and.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -86,12 +153,23 @@ define signext i8 @vreduce_and_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_or_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_or_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vredor.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -99,13 +177,25 @@ define signext i8 @vreduce_or_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.xor.nxv1i8(<vscale x 1 x i8>)
define signext i8 @vreduce_xor_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vreduce_xor_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.xor.nxv1i8(<vscale x 1 x i8> %v)
ret i8 %red
}
@@ -113,13 +203,25 @@ define signext i8 @vreduce_xor_nxv1i8(<vscale x 1 x i8> %v) {
declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_add_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_add_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_add_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredsum.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_add_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredsum.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -127,12 +229,23 @@ define signext i8 @vreduce_add_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.umax.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_umax_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_umax_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umax_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umax_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.umax.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -140,12 +253,23 @@ define signext i8 @vreduce_umax_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_smax_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_smax_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smax_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vredmax.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smax_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vredmax.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -153,12 +277,23 @@ define signext i8 @vreduce_smax_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.umin.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_umin_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_umin_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umin_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vredminu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umin_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vredminu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.umin.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -166,12 +301,23 @@ define signext i8 @vreduce_umin_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.smin.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_smin_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_smin_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smin_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vredmin.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smin_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vredmin.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.smin.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -179,12 +325,23 @@ define signext i8 @vreduce_smin_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.and.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_and_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_and_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vredand.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.and.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -192,12 +349,23 @@ define signext i8 @vreduce_and_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_or_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_or_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vredor.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -205,13 +373,25 @@ define signext i8 @vreduce_or_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.xor.nxv2i8(<vscale x 2 x i8>)
define signext i8 @vreduce_xor_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vreduce_xor_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.xor.nxv2i8(<vscale x 2 x i8> %v)
ret i8 %red
}
@@ -219,13 +399,25 @@ define signext i8 @vreduce_xor_nxv2i8(<vscale x 2 x i8> %v) {
declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_add_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_add_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_add_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredsum.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_add_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredsum.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -233,12 +425,23 @@ define signext i8 @vreduce_add_nxv4i8(<vscale x 4 x i8> %v) {
declare i8 @llvm.vector.reduce.umax.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_umax_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_umax_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umax_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umax_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.umax.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -246,25 +449,47 @@ define signext i8 @vreduce_umax_nxv4i8(<vscale x 4 x i8> %v) {
declare i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_smax_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_smax_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %red = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> %v)
- ret i8 %red
-}
+; RV32-LABEL: vreduce_smax_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vredmax.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smax_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vredmax.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
+ %red = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> %v)
+ ret i8 %red
+}
declare i8 @llvm.vector.reduce.umin.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_umin_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_umin_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umin_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vredminu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umin_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vredminu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.umin.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -272,12 +497,23 @@ define signext i8 @vreduce_umin_nxv4i8(<vscale x 4 x i8> %v) {
declare i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_smin_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_smin_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smin_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vredmin.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smin_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vredmin.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -285,12 +521,23 @@ define signext i8 @vreduce_smin_nxv4i8(<vscale x 4 x i8> %v) {
declare i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_and_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_and_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vredand.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -298,12 +545,23 @@ define signext i8 @vreduce_and_nxv4i8(<vscale x 4 x i8> %v) {
declare i8 @llvm.vector.reduce.or.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_or_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_or_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vredor.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.or.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -311,13 +569,25 @@ define signext i8 @vreduce_or_nxv4i8(<vscale x 4 x i8> %v) {
declare i8 @llvm.vector.reduce.xor.nxv4i8(<vscale x 4 x i8>)
define signext i8 @vreduce_xor_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vreduce_xor_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
%red = call i8 @llvm.vector.reduce.xor.nxv4i8(<vscale x 4 x i8> %v)
ret i8 %red
}
@@ -325,42 +595,82 @@ define signext i8 @vreduce_xor_nxv4i8(<vscale x 4 x i8> %v) {
declare i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_add_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_add_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_add_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredsum.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_add_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredsum.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
define signext i16 @vwreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vwreduce_add_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vwredsum.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vwreduce_add_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%e = sext <vscale x 1 x i8> %v to <vscale x 1 x i16>
%red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e)
ret i16 %red
}
define signext i16 @vwreduce_uadd_nxv1i8(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: vwreduce_uadd_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vwredsum.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vwreduce_uadd_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%e = sext <vscale x 1 x i8> %v to <vscale x 1 x i16>
%red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e)
ret i16 %red
@@ -369,12 +679,23 @@ define signext i16 @vwreduce_uadd_nxv1i8(<vscale x 1 x i8> %v) {
declare i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_umax_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_umax_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umax_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umax_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -382,12 +703,23 @@ define signext i16 @vreduce_umax_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_smax_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_smax_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smax_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vredmax.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smax_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vredmax.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -395,12 +727,23 @@ define signext i16 @vreduce_smax_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.umin.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_umin_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_umin_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umin_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vredminu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umin_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vredminu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.umin.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -408,12 +751,23 @@ define signext i16 @vreduce_umin_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.smin.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_smin_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_smin_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smin_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vredmin.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smin_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vredmin.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.smin.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -421,12 +775,23 @@ define signext i16 @vreduce_smin_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.and.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_and_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_and_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vredand.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.and.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -434,12 +799,23 @@ define signext i16 @vreduce_and_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.or.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_or_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_or_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vredor.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.or.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -447,13 +823,25 @@ define signext i16 @vreduce_or_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.xor.nxv1i16(<vscale x 1 x i16>)
define signext i16 @vreduce_xor_nxv1i16(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: vreduce_xor_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.xor.nxv1i16(<vscale x 1 x i16> %v)
ret i16 %red
}
@@ -461,42 +849,82 @@ define signext i16 @vreduce_xor_nxv1i16(<vscale x 1 x i16> %v) {
declare i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_add_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_add_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_add_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredsum.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_add_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredsum.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
define signext i16 @vwreduce_add_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vwreduce_add_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vwredsum.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vwreduce_add_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
%red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e)
ret i16 %red
}
define signext i16 @vwreduce_uadd_nxv2i8(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: vwreduce_uadd_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vwredsum.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vwreduce_uadd_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
%red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e)
ret i16 %red
@@ -505,12 +933,23 @@ define signext i16 @vwreduce_uadd_nxv2i8(<vscale x 2 x i8> %v) {
declare i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_umax_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_umax_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umax_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umax_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -518,12 +957,23 @@ define signext i16 @vreduce_umax_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_smax_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_smax_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smax_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vredmax.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smax_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vredmax.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -531,12 +981,23 @@ define signext i16 @vreduce_smax_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.umin.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_umin_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_umin_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umin_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vredminu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umin_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vredminu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.umin.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -544,12 +1005,23 @@ define signext i16 @vreduce_umin_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.smin.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_smin_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_smin_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smin_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vredmin.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smin_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vredmin.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.smin.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -557,12 +1029,23 @@ define signext i16 @vreduce_smin_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.and.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_and_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_and_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vredand.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.and.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -570,12 +1053,23 @@ define signext i16 @vreduce_and_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.or.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_or_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_or_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vredor.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.or.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -583,13 +1077,25 @@ define signext i16 @vreduce_or_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16>)
define signext i16 @vreduce_xor_nxv2i16(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: vreduce_xor_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %v)
ret i16 %red
}
@@ -597,42 +1103,82 @@ define signext i16 @vreduce_xor_nxv2i16(<vscale x 2 x i16> %v) {
declare i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_add_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_add_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredsum.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_add_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredsum.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
define signext i16 @vwreduce_add_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vwreduce_add_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vwredsum.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vwreduce_add_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16>
%red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e)
ret i16 %red
}
define signext i16 @vwreduce_uadd_nxv4i8(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: vwreduce_uadd_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vwredsum.vs v8, v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vwreduce_uadd_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16>
%red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e)
ret i16 %red
@@ -641,12 +1187,23 @@ define signext i16 @vwreduce_uadd_nxv4i8(<vscale x 4 x i8> %v) {
declare i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_umax_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_umax_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umax_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umax_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
@@ -654,12 +1211,23 @@ define signext i16 @vreduce_umax_nxv4i16(<vscale x 4 x i16> %v) {
declare i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_smax_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_smax_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smax_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vredmax.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smax_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vredmax.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
@@ -667,12 +1235,23 @@ define signext i16 @vreduce_smax_nxv4i16(<vscale x 4 x i16> %v) {
declare i16 @llvm.vector.reduce.umin.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_umin_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_umin_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_umin_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vredminu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_umin_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vredminu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.umin.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
@@ -680,12 +1259,23 @@ define signext i16 @vreduce_umin_nxv4i16(<vscale x 4 x i16> %v) {
declare i16 @llvm.vector.reduce.smin.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_smin_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_smin_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_smin_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vredmin.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_smin_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vredmin.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.smin.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
@@ -693,12 +1283,23 @@ define signext i16 @vreduce_smin_nxv4i16(<vscale x 4 x i16> %v) {
declare i16 @llvm.vector.reduce.and.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_and_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_and_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vredand.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.and.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
@@ -706,12 +1307,23 @@ define signext i16 @vreduce_and_nxv4i16(<vscale x 4 x i16> %v) {
declare i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_or_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_or_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vredor.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
@@ -719,13 +1331,25 @@ define signext i16 @vreduce_or_nxv4i16(<vscale x 4 x i16> %v) {
declare i16 @llvm.vector.reduce.xor.nxv4i16(<vscale x 4 x i16>)
define signext i16 @vreduce_xor_nxv4i16(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: vreduce_xor_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
%red = call i16 @llvm.vector.reduce.xor.nxv4i16(<vscale x 4 x i16> %v)
ret i16 %red
}
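
The added slli/srai pairs throughout this file implement sign_extend_inreg: the signext i8/signext i16 return attributes require the scalar in a0 to be sign-extended to XLEN, and the shift amount is XLEN minus the element width (24/16 on RV32 versus 56/48 on RV64), which is why the shared CHECK prefix had to split into RV32 and RV64. A minimal standalone sketch of the same lowering (hypothetical; not part of this patch):

define signext i8 @sext_inreg_sketch(i64 %x) {
  ; Truncating to i8 and returning with signext forces an explicit
  ; sign_extend_inreg, emitted as slli/srai by XLEN - 8 (56 on RV64).
  %t = trunc i64 %x to i8
  ret i8 %t
}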
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index 41ec2fc443d028..d30bcfe2a5f25f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -864,20 +864,16 @@ define <vscale x 2 x i64> @vwsll_vi_nxv2i64_nxv2i8(<vscale x 2 x i8> %a) {
; CHECK-NEXT: vsll.vi v8, v10, 2
; CHECK-NEXT: ret
;
-; RV32ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8:
-; RV32ZVBB: # %bb.0:
-; RV32ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBB-NEXT: vzext.vf8 v10, v8
-; RV32ZVBB-NEXT: vsll.vi v8, v10, 2
-; RV32ZVBB-NEXT: ret
-;
-; RV64ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8:
-; RV64ZVBB: # %bb.0:
-; RV64ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; RV64ZVBB-NEXT: vzext.vf4 v10, v8
-; RV64ZVBB-NEXT: vwsll.vi v8, v10, 2
-; RV64ZVBB-NEXT: ret
+; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
%z = shl <vscale x 2 x i64> %x, splat (i64 2)
ret <vscale x 2 x i64> %z
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32ZVBB: {{.*}}
+; RV64ZVBB: {{.*}}
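
Here the RV32ZVBB and RV64ZVBB outputs converged (both now select vzext.vf8 followed by vsll.vi), so update_llc_test_checks merged them under the shared CHECK-ZVBB prefix and left the now-unused prefixes in the autogenerated trailer above. For reference, the IR pattern being checked, copied from the surrounding context:

define <vscale x 2 x i64> @vwsll_vi_nxv2i64_nxv2i8(<vscale x 2 x i8> %a) {
  ; zext from i8 to i64 followed by a constant shift; with +zvbb this
  ; previously matched the widening vwsll.vi on RV64.
  %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
  %z = shl <vscale x 2 x i64> %x, splat (i64 2)
  ret <vscale x 2 x i64> %z
}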
diff --git a/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
index f7dda828856787..00720c82abf942 100644
--- a/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
+++ b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
@@ -93,7 +93,8 @@ define signext i32 @srem2_32(i32 signext %0) {
; NOSFB: # %bb.0:
; NOSFB-NEXT: srliw a1, a0, 31
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: srli a1, a1, 1
+; NOSFB-NEXT: slli a1, a1, 1
; NOSFB-NEXT: subw a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -104,7 +105,8 @@ define signext i32 @srem2_32(i32 signext %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 1
; SFB-NEXT: .LBB4_2:
-; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: srli a1, a1, 1
+; SFB-NEXT: slli a1, a1, 1
; SFB-NEXT: subw a0, a0, a1
; SFB-NEXT: ret
%res = srem i32 %0, 2
@@ -116,7 +118,8 @@ define signext i32 @sremneg2_32(i32 signext %0) {
; NOSFB: # %bb.0:
; NOSFB-NEXT: srliw a1, a0, 31
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: srli a1, a1, 1
+; NOSFB-NEXT: slli a1, a1, 1
; NOSFB-NEXT: subw a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -127,7 +130,8 @@ define signext i32 @sremneg2_32(i32 signext %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 1
; SFB-NEXT: .LBB5_2:
-; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: srli a1, a1, 1
+; SFB-NEXT: slli a1, a1, 1
; SFB-NEXT: subw a0, a0, a1
; SFB-NEXT: ret
%res = srem i32 %0, -2
@@ -139,7 +143,8 @@ define i64 @srem2_64(i64 %0) {
; NOSFB: # %bb.0:
; NOSFB-NEXT: srli a1, a0, 63
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: srai a1, a1, 1
+; NOSFB-NEXT: slli a1, a1, 1
; NOSFB-NEXT: sub a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -150,7 +155,8 @@ define i64 @srem2_64(i64 %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 1
; SFB-NEXT: .LBB6_2:
-; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: srai a1, a1, 1
+; SFB-NEXT: slli a1, a1, 1
; SFB-NEXT: sub a0, a0, a1
; SFB-NEXT: ret
%res = srem i64 %0, 2
@@ -162,7 +168,8 @@ define i64 @sremneg2_64(i64 %0) {
; NOSFB: # %bb.0:
; NOSFB-NEXT: srli a1, a0, 63
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: srai a1, a1, 1
+; NOSFB-NEXT: slli a1, a1, 1
; NOSFB-NEXT: sub a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -173,7 +180,8 @@ define i64 @sremneg2_64(i64 %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 1
; SFB-NEXT: .LBB7_2:
-; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: srai a1, a1, 1
+; SFB-NEXT: slli a1, a1, 1
; SFB-NEXT: sub a0, a0, a1
; SFB-NEXT: ret
%res = srem i64 %0, -2
@@ -274,7 +282,8 @@ define signext i32 @srem8_32(i32 signext %0) {
; NOSFB-NEXT: slli a1, a0, 1
; NOSFB-NEXT: srli a1, a1, 61
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: srli a1, a1, 3
+; NOSFB-NEXT: slli a1, a1, 3
; NOSFB-NEXT: subw a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -285,7 +294,8 @@ define signext i32 @srem8_32(i32 signext %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 7
; SFB-NEXT: .LBB12_2:
-; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: srli a1, a1, 3
+; SFB-NEXT: slli a1, a1, 3
; SFB-NEXT: subw a0, a0, a1
; SFB-NEXT: ret
%res = srem i32 %0, 8
@@ -298,7 +308,8 @@ define signext i32 @sremneg8_32(i32 signext %0) {
; NOSFB-NEXT: slli a1, a0, 1
; NOSFB-NEXT: srli a1, a1, 61
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: srli a1, a1, 3
+; NOSFB-NEXT: slli a1, a1, 3
; NOSFB-NEXT: subw a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -309,7 +320,8 @@ define signext i32 @sremneg8_32(i32 signext %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 7
; SFB-NEXT: .LBB13_2:
-; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: srli a1, a1, 3
+; SFB-NEXT: slli a1, a1, 3
; SFB-NEXT: subw a0, a0, a1
; SFB-NEXT: ret
%res = srem i32 %0, -8
@@ -322,7 +334,8 @@ define i64 @srem8_64(i64 %0) {
; NOSFB-NEXT: srai a1, a0, 63
; NOSFB-NEXT: srli a1, a1, 61
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: srai a1, a1, 3
+; NOSFB-NEXT: slli a1, a1, 3
; NOSFB-NEXT: sub a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -333,7 +346,8 @@ define i64 @srem8_64(i64 %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 7
; SFB-NEXT: .LBB14_2:
-; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: srai a1, a1, 3
+; SFB-NEXT: slli a1, a1, 3
; SFB-NEXT: sub a0, a0, a1
; SFB-NEXT: ret
%res = srem i64 %0, 8
@@ -346,7 +360,8 @@ define i64 @sremneg8_64(i64 %0) {
; NOSFB-NEXT: srai a1, a0, 63
; NOSFB-NEXT: srli a1, a1, 61
; NOSFB-NEXT: add a1, a1, a0
-; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: srai a1, a1, 3
+; NOSFB-NEXT: slli a1, a1, 3
; NOSFB-NEXT: sub a0, a0, a1
; NOSFB-NEXT: ret
;
@@ -357,7 +372,8 @@ define i64 @sremneg8_64(i64 %0) {
; SFB-NEXT: # %bb.1:
; SFB-NEXT: addi a1, a0, 7
; SFB-NEXT: .LBB15_2:
-; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: srai a1, a1, 3
+; SFB-NEXT: slli a1, a1, 3
; SFB-NEXT: sub a0, a0, a1
; SFB-NEXT: ret
%res = srem i64 %0, -8
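
The recurring change in this file: srem by +/-2^k lowers to "bias toward zero, clear the low k bits, subtract", and clearing the low k bits can be written either as and x, -(2^k) or as a right/left shift pair by k. The two forms are value-identical in two's complement for both logical and arithmetic right shifts; the updated checks expect the shift-pair form. One of the test patterns, for reference:

define i64 @srem2_sketch(i64 %x) {
  ; Lowers to: srli t, x, 63 ; add t, t, x ; clear low bit of t ;
  ; sub x, x, t -- the "clear low bit" step is what changed above.
  %r = srem i64 %x, 2
  ret i64 %r
}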
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed7257..06cc5aa421bc18 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -207,25 +207,25 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: add a1, a3, a1
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
+; RV32I-NEXT: lbu a5, 3(a1)
+; RV32I-NEXT: lbu a6, 2(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: andi a2, a2, 7
; RV32I-NEXT: srl a3, a3, a2
; RV32I-NEXT: lbu a4, 5(a1)
; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
+; RV32I-NEXT: lbu a6, 7(a1)
+; RV32I-NEXT: lbu a7, 6(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a5, a6, a7
+; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
; RV32I-NEXT: xori a6, a2, 31
@@ -234,13 +234,13 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: srl a4, a4, a2
; RV32I-NEXT: lbu a5, 9(a1)
; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
+; RV32I-NEXT: lbu t0, 11(a1)
+; RV32I-NEXT: lbu t1, 10(a1)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a7, t0, t1
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: or a5, a7, a5
; RV32I-NEXT: slli a7, a5, 1
; RV32I-NEXT: not t0, a2
@@ -249,13 +249,13 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or a4, a4, a7
; RV32I-NEXT: lbu a7, 12(a1)
; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
+; RV32I-NEXT: lbu t0, 15(a1)
+; RV32I-NEXT: lbu a1, 14(a1)
; RV32I-NEXT: or a7, t1, a7
; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a1, t0, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, a7
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
@@ -353,25 +353,25 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: add a1, a3, a1
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
+; RV32I-NEXT: lbu a5, 3(a1)
+; RV32I-NEXT: lbu a6, 2(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: andi a2, a2, 7
; RV32I-NEXT: srl a3, a3, a2
; RV32I-NEXT: lbu a4, 5(a1)
; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
+; RV32I-NEXT: lbu a6, 7(a1)
+; RV32I-NEXT: lbu a7, 6(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a5, a6, a7
+; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
; RV32I-NEXT: xori a6, a2, 31
@@ -380,13 +380,13 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: srl a4, a4, a2
; RV32I-NEXT: lbu a5, 9(a1)
; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
+; RV32I-NEXT: lbu t0, 11(a1)
+; RV32I-NEXT: lbu t1, 10(a1)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a7, t0, t1
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: or a5, a7, a5
; RV32I-NEXT: slli a7, a5, 1
; RV32I-NEXT: not t0, a2
@@ -395,13 +395,13 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or a4, a4, a7
; RV32I-NEXT: lbu a7, 12(a1)
; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
+; RV32I-NEXT: lbu t0, 15(a1)
+; RV32I-NEXT: lbu a1, 14(a1)
; RV32I-NEXT: or a7, t1, a7
; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a1, t0, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, a7
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
@@ -495,25 +495,25 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: sub a1, a3, a1
; RV32I-NEXT: lbu a3, 5(a1)
; RV32I-NEXT: lbu a4, 4(a1)
-; RV32I-NEXT: lbu a5, 6(a1)
-; RV32I-NEXT: lbu a6, 7(a1)
+; RV32I-NEXT: lbu a5, 7(a1)
+; RV32I-NEXT: lbu a6, 6(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: andi a2, a2, 7
; RV32I-NEXT: sll a4, a3, a2
; RV32I-NEXT: lbu a5, 1(a1)
; RV32I-NEXT: lbu a6, 0(a1)
-; RV32I-NEXT: lbu a7, 2(a1)
-; RV32I-NEXT: lbu t0, 3(a1)
+; RV32I-NEXT: lbu a7, 3(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, t0
+; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: or a5, a6, a5
; RV32I-NEXT: srli a6, a5, 1
; RV32I-NEXT: xori a7, a2, 31
@@ -521,13 +521,13 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or a4, a4, a6
; RV32I-NEXT: lbu a6, 9(a1)
; RV32I-NEXT: lbu t0, 8(a1)
-; RV32I-NEXT: lbu t1, 10(a1)
-; RV32I-NEXT: lbu t2, 11(a1)
+; RV32I-NEXT: lbu t1, 11(a1)
+; RV32I-NEXT: lbu t2, 10(a1)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t2
+; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: or a6, t0, a6
; RV32I-NEXT: sll t0, a6, a2
; RV32I-NEXT: srli a3, a3, 1
@@ -536,13 +536,13 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or a3, t0, a3
; RV32I-NEXT: lbu t0, 13(a1)
; RV32I-NEXT: lbu t1, 12(a1)
-; RV32I-NEXT: lbu t2, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
+; RV32I-NEXT: lbu t2, 15(a1)
+; RV32I-NEXT: lbu a1, 14(a1)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a1, t2, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: sll a1, a1, a2
; RV32I-NEXT: srli a6, a6, 1
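
The i128 shift tests above change only how loaded bytes are reassembled: bytes 2 and 3 of each word were previously shifted to bit 16 and bit 24 and or'd individually, and are now paired into a halfword first and shifted into place as a unit. Both associations produce the same word, since (b3 << 24) | (b2 << 16) == ((b3 << 8) | b2) << 16. A small sketch of the new grouping, with hypothetical names:

define i32 @combine_hi_bytes(i8 %b2, i8 %b3) {
  ; Pair the two high bytes into a halfword, then shift the halfword
  ; into bits 31:16; equivalent to shifting each byte separately.
  %z2 = zext i8 %b2 to i32
  %z3 = zext i8 %b3 to i32
  %hi = shl i32 %z3, 8
  %hw = or i32 %hi, %z2
  %r  = shl i32 %hw, 16
  ret i32 %r
}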
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index 5fa802b7f27cad..3eebbc29d05124 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -13,9 +13,9 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
; RV32-NEXT: lui a3, 699051
; RV32-NEXT: addi a4, a3, -1365
; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: srli a5, a5, 1
+; RV32-NEXT: slli a6, a5, 1
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, -1366
@@ -39,11 +39,11 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 1
-; RV64-NEXT: andi a4, a4, -2
+; RV64-NEXT: srli a4, a4, 1
+; RV64-NEXT: slli a5, a4, 1
; RV64-NEXT: lui a6, %hi(.LCPI0_0)
; RV64-NEXT: ld a6, %lo(.LCPI0_0)(a6)
-; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a5, a4
; RV64-NEXT: sub a2, a2, a4
; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: mul a5, a4, a6
@@ -68,9 +68,9 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
; RV32-NEXT: lui a3, 838861
; RV32-NEXT: addi a4, a3, -819
; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 2
-; RV32-NEXT: andi a5, a5, -4
-; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: srli a5, a5, 2
+; RV32-NEXT: slli a6, a5, 2
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, -820
@@ -94,11 +94,11 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 2
-; RV64-NEXT: andi a4, a4, -4
+; RV64-NEXT: srli a4, a4, 2
+; RV64-NEXT: slli a5, a4, 2
; RV64-NEXT: lui a6, %hi(.LCPI1_0)
; RV64-NEXT: ld a6, %lo(.LCPI1_0)(a6)
-; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a5, a4
; RV64-NEXT: sub a2, a2, a4
; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: mul a5, a4, a6
@@ -236,9 +236,9 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
; RV32-NEXT: lui a3, 986895
; RV32-NEXT: addi a4, a3, 241
; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 4
-; RV32-NEXT: andi a5, a5, -16
-; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: srli a5, a5, 4
+; RV32-NEXT: slli a6, a5, 4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, 240
@@ -262,11 +262,11 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 4
-; RV64-NEXT: andi a4, a4, -16
+; RV64-NEXT: srli a4, a4, 4
+; RV64-NEXT: slli a5, a4, 4
; RV64-NEXT: lui a6, %hi(.LCPI5_0)
; RV64-NEXT: ld a6, %lo(.LCPI5_0)(a6)
-; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a5, a4
; RV64-NEXT: sub a2, a2, a4
; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: mul a5, a4, a6
@@ -352,9 +352,9 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
; RV32-NEXT: lui a3, 1044496
; RV32-NEXT: addi a4, a3, -255
; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 8
-; RV32-NEXT: andi a5, a5, -256
-; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: srli a5, a5, 8
+; RV32-NEXT: slli a6, a5, 8
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, -256
@@ -378,11 +378,11 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 8
-; RV64-NEXT: andi a4, a4, -256
+; RV64-NEXT: srli a4, a4, 8
+; RV64-NEXT: slli a5, a4, 8
; RV64-NEXT: lui a6, %hi(.LCPI7_0)
; RV64-NEXT: ld a6, %lo(.LCPI7_0)(a6)
-; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a5, a4
; RV64-NEXT: sub a2, a2, a4
; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: mul a5, a4, a6
@@ -472,22 +472,22 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: lui a3, 1048560
-; RV32-NEXT: addi a4, a3, 1
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: and a3, a5, a3
-; RV32-NEXT: srli a5, a5, 16
-; RV32-NEXT: or a3, a3, a5
-; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: sub a3, a0, a2
-; RV32-NEXT: mulhu a4, a3, a4
-; RV32-NEXT: slli a5, a3, 16
-; RV32-NEXT: sub a4, a4, a5
+; RV32-NEXT: addi a3, a3, 1
+; RV32-NEXT: mulhu a4, a2, a3
+; RV32-NEXT: srli a4, a4, 16
+; RV32-NEXT: slli a5, a4, 16
+; RV32-NEXT: or a4, a5, a4
+; RV32-NEXT: sub a2, a2, a4
+; RV32-NEXT: sub a4, a0, a2
+; RV32-NEXT: mulhu a3, a4, a3
+; RV32-NEXT: slli a5, a4, 16
+; RV32-NEXT: sub a3, a3, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: slli a0, a1, 16
; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: add a1, a4, a1
-; RV32-NEXT: sub a0, a3, a5
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: sub a0, a4, a5
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_65537:
@@ -496,27 +496,27 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: lui a3, 1048560
-; RV64-NEXT: addiw a4, a3, 1
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: mulhu a5, a2, a4
-; RV64-NEXT: and a3, a5, a3
-; RV64-NEXT: srli a5, a5, 16
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: addiw a3, a3, 1
+; RV64-NEXT: slli a4, a3, 32
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: mulhu a4, a2, a3
+; RV64-NEXT: srli a4, a4, 16
+; RV64-NEXT: slli a5, a4, 16
+; RV64-NEXT: add a4, a5, a4
+; RV64-NEXT: sub a2, a2, a4
+; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: lui a5, 983041
; RV64-NEXT: slli a5, a5, 4
; RV64-NEXT: addi a5, a5, -1
; RV64-NEXT: slli a5, a5, 16
-; RV64-NEXT: mul a5, a3, a5
-; RV64-NEXT: mulhu a6, a3, a4
+; RV64-NEXT: mul a5, a4, a5
+; RV64-NEXT: mulhu a6, a4, a3
; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a4
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a3, a4
+; RV64-NEXT: mul a0, a4, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 65537
ret iXLen2 %a
@@ -535,9 +535,9 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV32-NEXT: lui a3, 699051
; RV32-NEXT: addi a4, a3, -1365
; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: srli a5, a5, 1
+; RV32-NEXT: slli a6, a5, 1
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, -1366
@@ -565,11 +565,11 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 1
-; RV64-NEXT: andi a4, a4, -2
+; RV64-NEXT: srli a4, a4, 1
+; RV64-NEXT: slli a5, a4, 1
; RV64-NEXT: lui a6, %hi(.LCPI10_0)
; RV64-NEXT: ld a6, %lo(.LCPI10_0)(a6)
-; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a5, a4
; RV64-NEXT: sub a2, a2, a4
; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: mul a5, a4, a6
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 8444520fcc7718..14dec170daac89 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -13,9 +13,9 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
; RV32-NEXT: lui a1, 699051
; RV32-NEXT: addi a1, a1, -1365
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 1
-; RV32-NEXT: andi a1, a1, -2
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -30,9 +30,9 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 1
-; RV64-NEXT: andi a1, a1, -2
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: srli a1, a1, 1
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -49,9 +49,9 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
; RV32-NEXT: lui a1, 838861
; RV32-NEXT: addi a1, a1, -819
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 2
-; RV32-NEXT: andi a1, a1, -4
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -66,9 +66,9 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 2
-; RV64-NEXT: andi a1, a1, -4
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -173,9 +173,9 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
; RV32-NEXT: lui a1, 986895
; RV32-NEXT: addi a1, a1, 241
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 4
-; RV32-NEXT: andi a1, a1, -16
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: srli a1, a1, 4
+; RV32-NEXT: slli a2, a1, 4
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -190,9 +190,9 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 4
-; RV64-NEXT: andi a1, a1, -16
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: srli a1, a1, 4
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -245,9 +245,9 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
; RV32-NEXT: lui a1, 1044496
; RV32-NEXT: addi a1, a1, -255
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 8
-; RV32-NEXT: andi a1, a1, -256
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: srli a1, a1, 8
+; RV32-NEXT: slli a2, a1, 8
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -262,9 +262,9 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 8
-; RV64-NEXT: andi a1, a1, -256
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: srli a1, a1, 8
+; RV64-NEXT: slli a2, a1, 8
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -315,11 +315,11 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
; RV32-NEXT: lui a1, 1048560
-; RV32-NEXT: addi a2, a1, 1
-; RV32-NEXT: mulhu a2, a0, a2
-; RV32-NEXT: and a1, a2, a1
-; RV32-NEXT: srli a2, a2, 16
-; RV32-NEXT: or a1, a1, a2
+; RV32-NEXT: addi a1, a1, 1
+; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: srli a1, a1, 16
+; RV32-NEXT: slli a2, a1, 16
+; RV32-NEXT: or a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -330,13 +330,13 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
; RV64-NEXT: lui a1, 1048560
-; RV64-NEXT: addiw a2, a1, 1
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: mulhu a2, a0, a2
-; RV64-NEXT: and a1, a2, a1
-; RV64-NEXT: srli a2, a2, 16
+; RV64-NEXT: addiw a1, a1, 1
+; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: mulhu a1, a0, a1
+; RV64-NEXT: srli a1, a1, 16
+; RV64-NEXT: slli a2, a1, 16
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -357,9 +357,9 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV32-NEXT: lui a2, 699051
; RV32-NEXT: addi a2, a2, -1365
; RV32-NEXT: mulhu a2, a1, a2
-; RV32-NEXT: srli a3, a2, 1
-; RV32-NEXT: andi a2, a2, -2
-; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: slli a3, a2, 1
+; RV32-NEXT: add a2, a3, a2
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: andi a0, a0, 3
@@ -381,9 +381,9 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV64-NEXT: slli a3, a2, 32
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: mulhu a2, a1, a2
-; RV64-NEXT: srli a3, a2, 1
-; RV64-NEXT: andi a2, a2, -2
-; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: srli a2, a2, 1
+; RV64-NEXT: slli a3, a2, 1
+; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: andi a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll
index 7c291bbceedc6d..f88949c7f3a7b0 100644
--- a/llvm/test/CodeGen/RISCV/srem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll
@@ -294,7 +294,8 @@ define i32 @dont_fold_srem_power_of_two(i32 %x) nounwind {
; RV32I-NEXT: srai a1, a0, 31
; RV32I-NEXT: srli a1, a1, 26
; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: andi a1, a1, -64
+; RV32I-NEXT: srai a1, a1, 6
+; RV32I-NEXT: slli a1, a1, 6
; RV32I-NEXT: sub a0, a0, a1
; RV32I-NEXT: ret
;
@@ -303,7 +304,8 @@ define i32 @dont_fold_srem_power_of_two(i32 %x) nounwind {
; RV32IM-NEXT: srai a1, a0, 31
; RV32IM-NEXT: srli a1, a1, 26
; RV32IM-NEXT: add a1, a0, a1
-; RV32IM-NEXT: andi a1, a1, -64
+; RV32IM-NEXT: srai a1, a1, 6
+; RV32IM-NEXT: slli a1, a1, 6
; RV32IM-NEXT: sub a0, a0, a1
; RV32IM-NEXT: ret
;
@@ -312,7 +314,8 @@ define i32 @dont_fold_srem_power_of_two(i32 %x) nounwind {
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: srliw a1, a1, 26
; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: andi a1, a1, -64
+; RV64I-NEXT: srli a1, a1, 6
+; RV64I-NEXT: slli a1, a1, 6
; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: ret
;
@@ -321,7 +324,8 @@ define i32 @dont_fold_srem_power_of_two(i32 %x) nounwind {
; RV64IM-NEXT: sraiw a1, a0, 31
; RV64IM-NEXT: srliw a1, a1, 26
; RV64IM-NEXT: add a1, a0, a1
-; RV64IM-NEXT: andi a1, a1, -64
+; RV64IM-NEXT: srli a1, a1, 6
+; RV64IM-NEXT: slli a1, a1, 6
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: ret
%1 = srem i32 %x, 64
@@ -345,8 +349,8 @@ define i32 @dont_fold_srem_i32_smax(i32 %x) nounwind {
; RV32I-NEXT: srai a1, a0, 31
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: lui a2, 524288
-; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: srli a1, a1, 31
+; RV32I-NEXT: slli a1, a1, 31
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -355,8 +359,8 @@ define i32 @dont_fold_srem_i32_smax(i32 %x) nounwind {
; RV32IM-NEXT: srai a1, a0, 31
; RV32IM-NEXT: srli a1, a1, 1
; RV32IM-NEXT: add a1, a0, a1
-; RV32IM-NEXT: lui a2, 524288
-; RV32IM-NEXT: and a1, a1, a2
+; RV32IM-NEXT: srli a1, a1, 31
+; RV32IM-NEXT: slli a1, a1, 31
; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: ret
;
@@ -365,8 +369,8 @@ define i32 @dont_fold_srem_i32_smax(i32 %x) nounwind {
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: srliw a1, a1, 1
; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: lui a2, 524288
-; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: srli a1, a1, 31
+; RV64I-NEXT: slli a1, a1, 31
; RV64I-NEXT: addw a0, a0, a1
; RV64I-NEXT: ret
;
@@ -375,8 +379,8 @@ define i32 @dont_fold_srem_i32_smax(i32 %x) nounwind {
; RV64IM-NEXT: sraiw a1, a0, 31
; RV64IM-NEXT: srliw a1, a1, 1
; RV64IM-NEXT: add a1, a0, a1
-; RV64IM-NEXT: lui a2, 524288
-; RV64IM-NEXT: and a1, a1, a2
+; RV64IM-NEXT: srli a1, a1, 31
+; RV64IM-NEXT: slli a1, a1, 31
; RV64IM-NEXT: addw a0, a0, a1
; RV64IM-NEXT: ret
%1 = srem i32 %x, 2147483648
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 9ecfa501783167..6be76e134d6243 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -144,10 +144,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV32M-NEXT: srai a1, a1, 28
; RV32M-NEXT: slli a2, a1, 1
; RV32M-NEXT: add a1, a2, a1
-; RV32M-NEXT: srli a2, a1, 4
-; RV32M-NEXT: slli a1, a1, 24
-; RV32M-NEXT: srli a1, a1, 31
-; RV32M-NEXT: add a1, a2, a1
+; RV32M-NEXT: srli a1, a1, 4
+; RV32M-NEXT: slli a2, a1, 28
+; RV32M-NEXT: srli a2, a2, 31
+; RV32M-NEXT: add a1, a1, a2
; RV32M-NEXT: li a2, 6
; RV32M-NEXT: mul a1, a1, a2
; RV32M-NEXT: sub a0, a0, a1
@@ -162,10 +162,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV64M-NEXT: srai a1, a1, 60
; RV64M-NEXT: slli a2, a1, 1
; RV64M-NEXT: add a1, a2, a1
-; RV64M-NEXT: srli a2, a1, 4
-; RV64M-NEXT: slli a1, a1, 56
-; RV64M-NEXT: srli a1, a1, 63
-; RV64M-NEXT: add a1, a2, a1
+; RV64M-NEXT: srli a1, a1, 4
+; RV64M-NEXT: slli a2, a1, 60
+; RV64M-NEXT: srli a2, a2, 63
+; RV64M-NEXT: add a1, a1, a2
; RV64M-NEXT: li a2, 6
; RV64M-NEXT: mul a1, a1, a2
; RV64M-NEXT: subw a0, a0, a1
@@ -180,10 +180,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV32MV-NEXT: srai a1, a1, 28
; RV32MV-NEXT: slli a2, a1, 1
; RV32MV-NEXT: add a1, a2, a1
-; RV32MV-NEXT: srli a2, a1, 4
-; RV32MV-NEXT: slli a1, a1, 24
-; RV32MV-NEXT: srli a1, a1, 31
-; RV32MV-NEXT: add a1, a2, a1
+; RV32MV-NEXT: srli a1, a1, 4
+; RV32MV-NEXT: slli a2, a1, 28
+; RV32MV-NEXT: srli a2, a2, 31
+; RV32MV-NEXT: add a1, a1, a2
; RV32MV-NEXT: li a2, 6
; RV32MV-NEXT: mul a1, a1, a2
; RV32MV-NEXT: sub a0, a0, a1
@@ -198,10 +198,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV64MV-NEXT: srai a1, a1, 60
; RV64MV-NEXT: slli a2, a1, 1
; RV64MV-NEXT: add a1, a2, a1
-; RV64MV-NEXT: srli a2, a1, 4
-; RV64MV-NEXT: slli a1, a1, 56
-; RV64MV-NEXT: srli a1, a1, 63
-; RV64MV-NEXT: add a1, a2, a1
+; RV64MV-NEXT: srli a1, a1, 4
+; RV64MV-NEXT: slli a2, a1, 60
+; RV64MV-NEXT: srli a2, a2, 63
+; RV64MV-NEXT: add a1, a1, a2
; RV64MV-NEXT: li a2, 6
; RV64MV-NEXT: mul a1, a1, a2
; RV64MV-NEXT: subw a0, a0, a1
@@ -219,10 +219,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: slli a1, a0, 26
; RV32-NEXT: srai a1, a1, 26
-; RV32-NEXT: slli a1, a1, 21
+; RV32-NEXT: srli a1, a1, 5
+; RV32-NEXT: slli a1, a1, 26
; RV32-NEXT: srli a1, a1, 30
; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: andi a1, a1, 60
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: andi a0, a0, 63
; RV32-NEXT: snez a0, a0
@@ -232,10 +234,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: slli a1, a0, 58
; RV64-NEXT: srai a1, a1, 58
-; RV64-NEXT: slli a1, a1, 53
+; RV64-NEXT: srli a1, a1, 5
+; RV64-NEXT: slli a1, a1, 58
; RV64-NEXT: srli a1, a1, 62
; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: andi a1, a1, 60
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: subw a0, a0, a1
; RV64-NEXT: andi a0, a0, 63
; RV64-NEXT: snez a0, a0
@@ -245,10 +249,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV32M: # %bb.0:
; RV32M-NEXT: slli a1, a0, 26
; RV32M-NEXT: srai a1, a1, 26
-; RV32M-NEXT: slli a1, a1, 21
+; RV32M-NEXT: srli a1, a1, 5
+; RV32M-NEXT: slli a1, a1, 26
; RV32M-NEXT: srli a1, a1, 30
; RV32M-NEXT: add a1, a0, a1
-; RV32M-NEXT: andi a1, a1, 60
+; RV32M-NEXT: srli a1, a1, 2
+; RV32M-NEXT: slli a1, a1, 2
; RV32M-NEXT: sub a0, a0, a1
; RV32M-NEXT: andi a0, a0, 63
; RV32M-NEXT: snez a0, a0
@@ -258,10 +264,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV64M: # %bb.0:
; RV64M-NEXT: slli a1, a0, 58
; RV64M-NEXT: srai a1, a1, 58
-; RV64M-NEXT: slli a1, a1, 53
+; RV64M-NEXT: srli a1, a1, 5
+; RV64M-NEXT: slli a1, a1, 58
; RV64M-NEXT: srli a1, a1, 62
; RV64M-NEXT: add a1, a0, a1
-; RV64M-NEXT: andi a1, a1, 60
+; RV64M-NEXT: srli a1, a1, 2
+; RV64M-NEXT: slli a1, a1, 2
; RV64M-NEXT: subw a0, a0, a1
; RV64M-NEXT: andi a0, a0, 63
; RV64M-NEXT: snez a0, a0
@@ -271,10 +279,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV32MV: # %bb.0:
; RV32MV-NEXT: slli a1, a0, 26
; RV32MV-NEXT: srai a1, a1, 26
-; RV32MV-NEXT: slli a1, a1, 21
+; RV32MV-NEXT: srli a1, a1, 5
+; RV32MV-NEXT: slli a1, a1, 26
; RV32MV-NEXT: srli a1, a1, 30
; RV32MV-NEXT: add a1, a0, a1
-; RV32MV-NEXT: andi a1, a1, 60
+; RV32MV-NEXT: srli a1, a1, 2
+; RV32MV-NEXT: slli a1, a1, 2
; RV32MV-NEXT: sub a0, a0, a1
; RV32MV-NEXT: andi a0, a0, 63
; RV32MV-NEXT: snez a0, a0
@@ -284,10 +294,12 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV64MV: # %bb.0:
; RV64MV-NEXT: slli a1, a0, 58
; RV64MV-NEXT: srai a1, a1, 58
-; RV64MV-NEXT: slli a1, a1, 53
+; RV64MV-NEXT: srli a1, a1, 5
+; RV64MV-NEXT: slli a1, a1, 58
; RV64MV-NEXT: srli a1, a1, 62
; RV64MV-NEXT: add a1, a0, a1
-; RV64MV-NEXT: andi a1, a1, 60
+; RV64MV-NEXT: srli a1, a1, 2
+; RV64MV-NEXT: slli a1, a1, 2
; RV64MV-NEXT: subw a0, a0, a1
; RV64MV-NEXT: andi a0, a0, 63
; RV64MV-NEXT: snez a0, a0
@@ -392,12 +404,14 @@ define void @test_srem_vec(ptr %X) nounwind {
; RV64-NEXT: lbu a0, 12(a0)
; RV64-NEXT: lwu a1, 8(s0)
; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: ld a2, 0(s0)
; RV64-NEXT: or a0, a1, a0
-; RV64-NEXT: slli a0, a0, 29
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ld a2, 0(s0)
+; RV64-NEXT: slli a0, a0, 31
; RV64-NEXT: srai s1, a0, 31
-; RV64-NEXT: srli a0, a2, 2
; RV64-NEXT: slli a1, a1, 62
+; RV64-NEXT: srli a0, a2, 33
+; RV64-NEXT: slli a0, a0, 31
; RV64-NEXT: or a0, a1, a0
; RV64-NEXT: srai a0, a0, 31
; RV64-NEXT: slli a2, a2, 31
@@ -534,69 +548,69 @@ define void @test_srem_vec(ptr %X) nounwind {
;
; RV64M-LABEL: test_srem_vec:
; RV64M: # %bb.0:
-; RV64M-NEXT: ld a1, 0(a0)
-; RV64M-NEXT: lwu a2, 8(a0)
-; RV64M-NEXT: srli a3, a1, 2
-; RV64M-NEXT: lbu a4, 12(a0)
-; RV64M-NEXT: slli a5, a2, 62
-; RV64M-NEXT: or a3, a5, a3
-; RV64M-NEXT: srai a3, a3, 31
-; RV64M-NEXT: slli a4, a4, 32
-; RV64M-NEXT: or a2, a2, a4
-; RV64M-NEXT: slli a2, a2, 29
-; RV64M-NEXT: lui a4, %hi(.LCPI3_0)
-; RV64M-NEXT: ld a4, %lo(.LCPI3_0)(a4)
+; RV64M-NEXT: lwu a1, 8(a0)
+; RV64M-NEXT: lbu a2, 12(a0)
+; RV64M-NEXT: slli a3, a1, 29
+; RV64M-NEXT: ld a4, 0(a0)
+; RV64M-NEXT: slli a2, a2, 61
+; RV64M-NEXT: or a2, a2, a3
; RV64M-NEXT: srai a2, a2, 31
-; RV64M-NEXT: slli a1, a1, 31
+; RV64M-NEXT: srli a3, a4, 2
+; RV64M-NEXT: slli a1, a1, 62
+; RV64M-NEXT: or a1, a1, a3
+; RV64M-NEXT: lui a3, %hi(.LCPI3_0)
+; RV64M-NEXT: ld a3, %lo(.LCPI3_0)(a3)
; RV64M-NEXT: srai a1, a1, 31
-; RV64M-NEXT: mulh a4, a2, a4
-; RV64M-NEXT: srli a5, a4, 63
-; RV64M-NEXT: srai a4, a4, 1
-; RV64M-NEXT: add a4, a4, a5
+; RV64M-NEXT: slli a4, a4, 31
+; RV64M-NEXT: srai a4, a4, 31
+; RV64M-NEXT: mulh a3, a1, a3
+; RV64M-NEXT: srli a5, a3, 63
+; RV64M-NEXT: srai a3, a3, 1
+; RV64M-NEXT: add a3, a3, a5
; RV64M-NEXT: lui a5, %hi(.LCPI3_1)
; RV64M-NEXT: ld a5, %lo(.LCPI3_1)(a5)
-; RV64M-NEXT: add a2, a2, a4
-; RV64M-NEXT: slli a4, a4, 2
-; RV64M-NEXT: add a2, a2, a4
-; RV64M-NEXT: mulh a4, a3, a5
-; RV64M-NEXT: srli a5, a4, 63
-; RV64M-NEXT: srai a4, a4, 1
-; RV64M-NEXT: add a4, a4, a5
-; RV64M-NEXT: slli a5, a4, 3
-; RV64M-NEXT: add a3, a3, a4
-; RV64M-NEXT: sub a3, a3, a5
-; RV64M-NEXT: addi a3, a3, -1
-; RV64M-NEXT: seqz a3, a3
-; RV64M-NEXT: lui a4, 699051
-; RV64M-NEXT: addiw a4, a4, -1365
-; RV64M-NEXT: slli a5, a4, 32
-; RV64M-NEXT: add a4, a4, a5
-; RV64M-NEXT: lui a5, %hi(.LCPI3_2)
-; RV64M-NEXT: ld a5, %lo(.LCPI3_2)(a5)
+; RV64M-NEXT: add a1, a1, a3
+; RV64M-NEXT: slli a3, a3, 3
+; RV64M-NEXT: sub a1, a1, a3
+; RV64M-NEXT: mulh a3, a2, a5
+; RV64M-NEXT: srli a5, a3, 63
+; RV64M-NEXT: srai a3, a3, 1
+; RV64M-NEXT: add a3, a3, a5
+; RV64M-NEXT: slli a5, a3, 2
+; RV64M-NEXT: add a2, a2, a3
+; RV64M-NEXT: add a2, a2, a5
; RV64M-NEXT: addi a2, a2, -2
; RV64M-NEXT: seqz a2, a2
-; RV64M-NEXT: mul a1, a1, a4
-; RV64M-NEXT: add a1, a1, a5
-; RV64M-NEXT: slli a4, a1, 63
-; RV64M-NEXT: srli a1, a1, 1
-; RV64M-NEXT: or a1, a1, a4
-; RV64M-NEXT: sltu a1, a5, a1
+; RV64M-NEXT: lui a3, 699051
+; RV64M-NEXT: addiw a3, a3, -1365
+; RV64M-NEXT: slli a5, a3, 32
+; RV64M-NEXT: add a3, a3, a5
+; RV64M-NEXT: lui a5, %hi(.LCPI3_2)
+; RV64M-NEXT: ld a5, %lo(.LCPI3_2)(a5)
+; RV64M-NEXT: addi a1, a1, -1
+; RV64M-NEXT: seqz a1, a1
+; RV64M-NEXT: mul a3, a4, a3
+; RV64M-NEXT: add a3, a3, a5
+; RV64M-NEXT: slli a4, a3, 63
+; RV64M-NEXT: srli a3, a3, 1
+; RV64M-NEXT: or a3, a3, a4
+; RV64M-NEXT: sltu a3, a5, a3
+; RV64M-NEXT: addi a1, a1, -1
; RV64M-NEXT: addi a2, a2, -1
-; RV64M-NEXT: addi a3, a3, -1
-; RV64M-NEXT: neg a1, a1
-; RV64M-NEXT: slli a4, a3, 33
-; RV64M-NEXT: slli a1, a1, 31
-; RV64M-NEXT: srli a1, a1, 31
-; RV64M-NEXT: or a1, a1, a4
-; RV64M-NEXT: sd a1, 0(a0)
-; RV64M-NEXT: slli a1, a2, 2
+; RV64M-NEXT: neg a3, a3
+; RV64M-NEXT: slli a4, a2, 29
+; RV64M-NEXT: srli a4, a4, 61
+; RV64M-NEXT: sb a4, 12(a0)
+; RV64M-NEXT: slli a4, a1, 33
; RV64M-NEXT: slli a3, a3, 31
-; RV64M-NEXT: srli a3, a3, 62
-; RV64M-NEXT: or a1, a3, a1
+; RV64M-NEXT: srli a3, a3, 31
+; RV64M-NEXT: or a3, a3, a4
+; RV64M-NEXT: sd a3, 0(a0)
+; RV64M-NEXT: slli a2, a2, 2
+; RV64M-NEXT: slli a1, a1, 31
+; RV64M-NEXT: srli a1, a1, 62
+; RV64M-NEXT: or a1, a1, a2
; RV64M-NEXT: sw a1, 8(a0)
-; RV64M-NEXT: slli a2, a2, 29
-; RV64M-NEXT: srli a2, a2, 61
-; RV64M-NEXT: sb a2, 12(a0)
; RV64M-NEXT: ret
;
; RV32MV-LABEL: test_srem_vec:
@@ -728,13 +742,13 @@ define void @test_srem_vec(ptr %X) nounwind {
; RV64MV-NEXT: ld a1, 0(a0)
; RV64MV-NEXT: lwu a2, 8(a0)
; RV64MV-NEXT: srli a3, a1, 2
-; RV64MV-NEXT: lbu a4, 12(a0)
-; RV64MV-NEXT: slli a5, a2, 62
-; RV64MV-NEXT: or a3, a5, a3
+; RV64MV-NEXT: slli a4, a2, 62
+; RV64MV-NEXT: lbu a5, 12(a0)
+; RV64MV-NEXT: or a3, a4, a3
; RV64MV-NEXT: srai a3, a3, 31
-; RV64MV-NEXT: slli a4, a4, 32
-; RV64MV-NEXT: or a2, a2, a4
; RV64MV-NEXT: slli a2, a2, 29
+; RV64MV-NEXT: slli a5, a5, 61
+; RV64MV-NEXT: or a2, a5, a2
; RV64MV-NEXT: lui a4, %hi(.LCPI3_0)
; RV64MV-NEXT: ld a4, %lo(.LCPI3_0)(a4)
; RV64MV-NEXT: srai a2, a2, 31
@@ -779,25 +793,25 @@ define void @test_srem_vec(ptr %X) nounwind {
; RV64MV-NEXT: vmsne.vv v0, v8, v12
; RV64MV-NEXT: vmv.v.i v8, 0
; RV64MV-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64MV-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64MV-NEXT: vslidedown.vi v10, v8, 2
-; RV64MV-NEXT: vmv.x.s a2, v10
-; RV64MV-NEXT: slli a3, a2, 31
-; RV64MV-NEXT: srli a3, a3, 61
-; RV64MV-NEXT: sb a3, 12(a0)
-; RV64MV-NEXT: vmv.x.s a3, v8
-; RV64MV-NEXT: and a1, a3, a1
+; RV64MV-NEXT: vmv.x.s a2, v8
+; RV64MV-NEXT: and a1, a2, a1
; RV64MV-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64MV-NEXT: vslidedown.vi v8, v8, 1
-; RV64MV-NEXT: vmv.x.s a3, v8
-; RV64MV-NEXT: slli a4, a3, 33
-; RV64MV-NEXT: or a1, a1, a4
+; RV64MV-NEXT: vslidedown.vi v10, v8, 1
+; RV64MV-NEXT: vmv.x.s a2, v10
+; RV64MV-NEXT: slli a3, a2, 33
+; RV64MV-NEXT: or a1, a1, a3
; RV64MV-NEXT: sd a1, 0(a0)
-; RV64MV-NEXT: slli a2, a2, 2
-; RV64MV-NEXT: slli a3, a3, 31
-; RV64MV-NEXT: srli a3, a3, 62
-; RV64MV-NEXT: or a2, a3, a2
+; RV64MV-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV64MV-NEXT: vslidedown.vi v8, v8, 2
+; RV64MV-NEXT: vmv.x.s a1, v8
+; RV64MV-NEXT: slli a1, a1, 2
+; RV64MV-NEXT: slli a2, a2, 31
+; RV64MV-NEXT: srli a2, a2, 62
+; RV64MV-NEXT: or a2, a2, a1
; RV64MV-NEXT: sw a2, 8(a0)
+; RV64MV-NEXT: slli a1, a1, 29
+; RV64MV-NEXT: srli a1, a1, 61
+; RV64MV-NEXT: sb a1, 12(a0)
; RV64MV-NEXT: ret
%ld = load <3 x i33>, ptr %X
%srem = srem <3 x i33> %ld, <i33 6, i33 7, i33 -5>
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 7fc4713ac2d6e1..c38ffc20e07c49 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -629,15 +629,18 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
; RV32I-NEXT: lh a1, 4(a1)
; RV32I-NEXT: srli a4, a2, 26
; RV32I-NEXT: add a4, a2, a4
-; RV32I-NEXT: andi a4, a4, -64
+; RV32I-NEXT: srli a4, a4, 6
+; RV32I-NEXT: slli a4, a4, 6
; RV32I-NEXT: sub s1, a2, a4
; RV32I-NEXT: srli a2, a1, 27
; RV32I-NEXT: add a2, a1, a2
-; RV32I-NEXT: andi a2, a2, -32
+; RV32I-NEXT: srli a2, a2, 5
+; RV32I-NEXT: slli a2, a2, 5
; RV32I-NEXT: sub s2, a1, a2
; RV32I-NEXT: srli a1, a3, 29
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: andi a1, a1, -8
+; RV32I-NEXT: srli a1, a1, 3
+; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: sub s3, a3, a1
; RV32I-NEXT: li a1, 95
; RV32I-NEXT: call __modsi3
@@ -671,15 +674,18 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
; RV32IM-NEXT: sub a4, a4, a5
; RV32IM-NEXT: srli a5, a1, 26
; RV32IM-NEXT: add a5, a1, a5
-; RV32IM-NEXT: andi a5, a5, -64
+; RV32IM-NEXT: srli a5, a5, 6
+; RV32IM-NEXT: slli a5, a5, 6
; RV32IM-NEXT: sub a1, a1, a5
; RV32IM-NEXT: srli a5, a3, 27
; RV32IM-NEXT: add a5, a3, a5
-; RV32IM-NEXT: andi a5, a5, -32
+; RV32IM-NEXT: srli a5, a5, 5
+; RV32IM-NEXT: slli a5, a5, 5
; RV32IM-NEXT: sub a3, a3, a5
; RV32IM-NEXT: srli a5, a2, 29
; RV32IM-NEXT: add a5, a2, a5
-; RV32IM-NEXT: andi a5, a5, -8
+; RV32IM-NEXT: srli a5, a5, 3
+; RV32IM-NEXT: slli a5, a5, 3
; RV32IM-NEXT: sub a2, a2, a5
; RV32IM-NEXT: sh a2, 4(a0)
; RV32IM-NEXT: sh a3, 2(a0)
@@ -702,15 +708,18 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
; RV64I-NEXT: lh a1, 8(a1)
; RV64I-NEXT: srli a4, a2, 58
; RV64I-NEXT: add a4, a2, a4
-; RV64I-NEXT: andi a4, a4, -64
+; RV64I-NEXT: srli a4, a4, 6
+; RV64I-NEXT: slli a4, a4, 6
; RV64I-NEXT: subw s1, a2, a4
; RV64I-NEXT: srli a2, a1, 59
; RV64I-NEXT: add a2, a1, a2
-; RV64I-NEXT: andi a2, a2, -32
+; RV64I-NEXT: srli a2, a2, 5
+; RV64I-NEXT: slli a2, a2, 5
; RV64I-NEXT: subw s2, a1, a2
; RV64I-NEXT: srli a1, a3, 61
; RV64I-NEXT: add a1, a3, a1
-; RV64I-NEXT: andi a1, a1, -8
+; RV64I-NEXT: srli a1, a1, 3
+; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: subw s3, a3, a1
; RV64I-NEXT: li a1, 95
; RV64I-NEXT: call __moddi3
@@ -744,15 +753,18 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
; RV64IM-NEXT: subw a2, a2, a3
; RV64IM-NEXT: srli a3, a1, 58
; RV64IM-NEXT: add a3, a1, a3
-; RV64IM-NEXT: andi a3, a3, -64
+; RV64IM-NEXT: srli a3, a3, 6
+; RV64IM-NEXT: slli a3, a3, 6
; RV64IM-NEXT: subw a1, a1, a3
; RV64IM-NEXT: srli a3, a5, 59
; RV64IM-NEXT: add a3, a5, a3
-; RV64IM-NEXT: andi a3, a3, -32
+; RV64IM-NEXT: srli a3, a3, 5
+; RV64IM-NEXT: slli a3, a3, 5
; RV64IM-NEXT: subw a5, a5, a3
; RV64IM-NEXT: srli a3, a4, 61
; RV64IM-NEXT: add a3, a4, a3
-; RV64IM-NEXT: andi a3, a3, -8
+; RV64IM-NEXT: srli a3, a3, 3
+; RV64IM-NEXT: slli a3, a3, 3
; RV64IM-NEXT: subw a4, a4, a3
; RV64IM-NEXT: sh a4, 4(a0)
; RV64IM-NEXT: sh a5, 2(a0)
@@ -937,8 +949,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV32I-NEXT: lh a0, 8(a1)
; RV32I-NEXT: srli a1, a2, 17
; RV32I-NEXT: add a1, a2, a1
-; RV32I-NEXT: lui a3, 8
-; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: srli a1, a1, 15
+; RV32I-NEXT: slli a1, a1, 15
; RV32I-NEXT: sub s3, a2, a1
; RV32I-NEXT: li a1, 23
; RV32I-NEXT: call __modsi3
@@ -986,12 +998,12 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV32IM-NEXT: sub a1, a1, a4
; RV32IM-NEXT: srli a4, a2, 17
; RV32IM-NEXT: add a4, a2, a4
-; RV32IM-NEXT: lui a5, 8
-; RV32IM-NEXT: and a4, a4, a5
+; RV32IM-NEXT: srli a4, a4, 15
+; RV32IM-NEXT: slli a4, a4, 15
; RV32IM-NEXT: sub a2, a2, a4
; RV32IM-NEXT: sh zero, 0(a0)
-; RV32IM-NEXT: sh a2, 2(a0)
; RV32IM-NEXT: sh a1, 6(a0)
+; RV32IM-NEXT: sh a2, 2(a0)
; RV32IM-NEXT: sh a3, 4(a0)
; RV32IM-NEXT: ret
;
@@ -1009,8 +1021,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV64I-NEXT: lh a0, 16(a1)
; RV64I-NEXT: srli a1, a2, 49
; RV64I-NEXT: add a1, a2, a1
-; RV64I-NEXT: lui a3, 8
-; RV64I-NEXT: and a1, a1, a3
+; RV64I-NEXT: srli a1, a1, 15
+; RV64I-NEXT: slli a1, a1, 15
; RV64I-NEXT: subw s3, a2, a1
; RV64I-NEXT: li a1, 23
; RV64I-NEXT: call __moddi3
@@ -1058,12 +1070,12 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV64IM-NEXT: subw a4, a4, a3
; RV64IM-NEXT: srli a3, a1, 49
; RV64IM-NEXT: add a3, a1, a3
-; RV64IM-NEXT: lui a5, 8
-; RV64IM-NEXT: and a3, a3, a5
+; RV64IM-NEXT: srli a3, a3, 15
+; RV64IM-NEXT: slli a3, a3, 15
; RV64IM-NEXT: subw a1, a1, a3
; RV64IM-NEXT: sh zero, 0(a0)
-; RV64IM-NEXT: sh a1, 2(a0)
; RV64IM-NEXT: sh a4, 6(a0)
+; RV64IM-NEXT: sh a1, 2(a0)
; RV64IM-NEXT: sh a2, 4(a0)
; RV64IM-NEXT: ret
%1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index ce0d8fedbfb88f..736c64f4b76051 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -60,19 +60,33 @@ define i24 @load_i24(ptr %p) {
}
define i32 @load_i32(ptr %p) {
-; SLOW-LABEL: load_i32:
-; SLOW: # %bb.0:
-; SLOW-NEXT: lbu a1, 1(a0)
-; SLOW-NEXT: lbu a2, 0(a0)
-; SLOW-NEXT: lbu a3, 2(a0)
-; SLOW-NEXT: lbu a0, 3(a0)
-; SLOW-NEXT: slli a1, a1, 8
-; SLOW-NEXT: or a1, a1, a2
-; SLOW-NEXT: slli a3, a3, 16
-; SLOW-NEXT: slli a0, a0, 24
-; SLOW-NEXT: or a0, a0, a3
-; SLOW-NEXT: or a0, a0, a1
-; SLOW-NEXT: ret
+; RV32I-LABEL: load_i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a1, 1(a0)
+; RV32I-NEXT: lbu a2, 0(a0)
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: load_i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a1, 1(a0)
+; RV64I-NEXT: lbu a2, 0(a0)
+; RV64I-NEXT: lbu a3, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a2
+; RV64I-NEXT: slli a3, a3, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: ret
;
; FAST-LABEL: load_i32:
; FAST: # %bb.0:
@@ -87,23 +101,23 @@ define i64 @load_i64(ptr %p) {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a1, 1(a0)
; RV32I-NEXT: lbu a2, 0(a0)
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: lbu a4, 3(a0)
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu a4, 2(a0)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a2, a4, a3
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a2, a3, 16
; RV32I-NEXT: or a2, a2, a1
; RV32I-NEXT: lbu a1, 5(a0)
; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 6(a0)
-; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: lbu a4, 7(a0)
+; RV32I-NEXT: lbu a0, 6(a0)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a4, a4, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
@@ -112,23 +126,23 @@ define i64 @load_i64(ptr %p) {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a1, 1(a0)
; RV64I-NEXT: lbu a2, 0(a0)
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: lbu a4, 3(a0)
+; RV64I-NEXT: lbu a3, 3(a0)
+; RV64I-NEXT: lbu a4, 2(a0)
; RV64I-NEXT: slli a1, a1, 8
; RV64I-NEXT: or a1, a1, a2
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a3, a3, 16
-; RV64I-NEXT: slli a4, a4, 24
-; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a1, a3, a1
; RV64I-NEXT: lbu a2, 5(a0)
; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: lbu a4, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a4, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: or a2, a2, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a4
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index c016e8f3163635..fab45e57b4f900 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -446,7 +446,8 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV32M-NEXT: andi a2, a2, 2047
; RV32M-NEXT: li a4, 683
; RV32M-NEXT: mul a2, a2, a4
-; RV32M-NEXT: slli a4, a2, 10
+; RV32M-NEXT: slli a4, a2, 1
+; RV32M-NEXT: slli a4, a4, 9
; RV32M-NEXT: slli a2, a2, 21
; RV32M-NEXT: srli a2, a2, 22
; RV32M-NEXT: or a2, a2, a4
@@ -488,7 +489,8 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV64M-NEXT: andi a1, a1, 2047
; RV64M-NEXT: li a4, 683
; RV64M-NEXT: mul a1, a1, a4
-; RV64M-NEXT: slli a4, a1, 10
+; RV64M-NEXT: slli a4, a1, 1
+; RV64M-NEXT: slli a4, a4, 9
; RV64M-NEXT: slli a1, a1, 53
; RV64M-NEXT: srli a1, a1, 54
; RV64M-NEXT: or a1, a1, a4
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92bd..f16f90e6e255ec 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -7,37 +7,37 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lb a5, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: lb a0, 3(a0)
+; RV64I-NEXT: lbu a0, 2(a0)
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
-; RV64I-NEXT: srli a1, a0, 16
-; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: srli a1, a0, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a0, a0, 16
+; RV64I-NEXT: sb a0, 2(a2)
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_4bytes:
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a0, a5, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
@@ -45,11 +45,11 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: slli a5, a5, 19
+; RV32I-NEXT: slli a1, a1, 27
; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a3, a3, 3
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: srl a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
; RV32I-NEXT: srli a1, a0, 16
@@ -71,37 +71,37 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lb a5, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: lb a0, 3(a0)
+; RV64I-NEXT: lbu a0, 2(a0)
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
-; RV64I-NEXT: srli a1, a0, 16
-; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: srli a1, a0, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a0, a0, 16
+; RV64I-NEXT: sb a0, 2(a2)
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_4bytes:
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a0, a5, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
@@ -109,11 +109,11 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: slli a5, a5, 19
+; RV32I-NEXT: slli a1, a1, 27
; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a3, a3, 3
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: sll a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
; RV32I-NEXT: srli a1, a0, 16
@@ -135,37 +135,37 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lb a5, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: lb a0, 3(a0)
+; RV64I-NEXT: lbu a0, 2(a0)
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: sraw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
-; RV64I-NEXT: srli a1, a0, 16
-; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: srli a1, a0, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a0, a0, 16
+; RV64I-NEXT: sb a0, 2(a2)
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_4bytes:
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a0, a5, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
@@ -173,11 +173,11 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: slli a5, a5, 19
+; RV32I-NEXT: slli a1, a1, 27
; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a3, a3, 3
; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: sra a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
; RV32I-NEXT: srli a1, a0, 16
@@ -200,49 +200,49 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: lbu a3, 5(a1)
-; RV64I-NEXT: lbu a4, 4(a1)
-; RV64I-NEXT: lbu a5, 6(a1)
-; RV64I-NEXT: lbu a6, 7(a1)
+; RV64I-NEXT: lbu a3, 1(a1)
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a5, 3(a1)
+; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 1(a1)
-; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a6, 2(a1)
-; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: slli a6, a6, 51
+; RV64I-NEXT: slli a1, a1, 59
; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a4, a4, 35
; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: slli a3, a3, 35
-; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: slli a3, a3, 3
+; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: srli a1, a0, 48
@@ -265,13 +265,13 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 5(a0)
; RV32I-NEXT: lbu a4, 4(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 7(a0)
+; RV32I-NEXT: lbu a5, 7(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
@@ -279,11 +279,11 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: slli a6, a6, 19
+; RV32I-NEXT: slli a1, a1, 27
; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: or a1, a1, a4
-; RV32I-NEXT: slli a5, a1, 3
+; RV32I-NEXT: slli a4, a4, 3
+; RV32I-NEXT: or a5, a1, a4
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: srl a1, a3, a5
; RV32I-NEXT: bltz a4, .LBB3_2
@@ -293,17 +293,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: .LBB3_2:
; RV32I-NEXT: lbu a6, 1(a0)
; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu t0, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a0, t0, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: srl a0, a0, a5
-; RV32I-NEXT: slli a3, a3, 1
; RV32I-NEXT: not a5, a5
+; RV32I-NEXT: slli a3, a3, 1
; RV32I-NEXT: sll a3, a3, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: .LBB3_3:
@@ -336,49 +336,49 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: lbu a3, 5(a1)
-; RV64I-NEXT: lbu a4, 4(a1)
-; RV64I-NEXT: lbu a5, 6(a1)
-; RV64I-NEXT: lbu a6, 7(a1)
+; RV64I-NEXT: lbu a3, 1(a1)
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a5, 3(a1)
+; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 1(a1)
-; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a6, 2(a1)
-; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: slli a6, a6, 51
+; RV64I-NEXT: slli a1, a1, 59
; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a4, a4, 35
; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: slli a3, a3, 35
-; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: slli a3, a3, 3
+; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: srli a1, a0, 48
@@ -401,13 +401,13 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
@@ -415,11 +415,11 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: slli a6, a6, 19
+; RV32I-NEXT: slli a1, a1, 27
; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: or a1, a1, a4
-; RV32I-NEXT: slli a5, a1, 3
+; RV32I-NEXT: slli a4, a4, 3
+; RV32I-NEXT: or a5, a1, a4
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: sll a1, a3, a5
; RV32I-NEXT: bltz a4, .LBB4_2
@@ -429,17 +429,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: .LBB4_2:
; RV32I-NEXT: lbu a6, 5(a0)
; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 6(a0)
-; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
+; RV32I-NEXT: lbu a0, 6(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a0, t0, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: sll a0, a0, a5
-; RV32I-NEXT: srli a3, a3, 1
; RV32I-NEXT: not a5, a5
+; RV32I-NEXT: srli a3, a3, 1
; RV32I-NEXT: srl a3, a3, a5
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: .LBB4_3:
@@ -472,49 +472,49 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: lbu a3, 5(a1)
-; RV64I-NEXT: lbu a4, 4(a1)
-; RV64I-NEXT: lbu a5, 6(a1)
-; RV64I-NEXT: lbu a6, 7(a1)
+; RV64I-NEXT: lbu a3, 1(a1)
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a5, 3(a1)
+; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 1(a1)
-; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a6, 2(a1)
-; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: slli a6, a6, 51
+; RV64I-NEXT: slli a1, a1, 59
; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a4, a4, 35
; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: slli a3, a3, 35
-; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: slli a3, a3, 3
+; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: sra a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
; RV64I-NEXT: srli a1, a0, 48
@@ -537,47 +537,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 5(a0)
; RV32I-NEXT: lbu a4, 4(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 7(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a4, a6, 24
-; RV32I-NEXT: or a5, a4, a5
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: lbu a5, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a4, 7(a0)
+; RV32I-NEXT: lbu a5, 6(a0)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: lbu a7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: slli a5, a4, 16
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: slli a7, a7, 19
+; RV32I-NEXT: slli a1, a1, 27
; RV32I-NEXT: or a1, a1, a7
-; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: slli a5, a1, 3
-; RV32I-NEXT: addi a6, a5, -32
-; RV32I-NEXT: sra a1, a3, a5
+; RV32I-NEXT: slli a4, a6, 3
+; RV32I-NEXT: or a4, a1, a4
+; RV32I-NEXT: addi a6, a4, -32
+; RV32I-NEXT: sra a1, a3, a4
; RV32I-NEXT: bltz a6, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a4, a4, 31
+; RV32I-NEXT: srai a5, a5, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a4
+; RV32I-NEXT: mv a1, a5
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
; RV32I-NEXT: lbu a6, 0(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a7
-; RV32I-NEXT: or a0, a0, a4
-; RV32I-NEXT: srl a0, a0, a5
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a0, a7, a0
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: srl a0, a0, a4
+; RV32I-NEXT: not a4, a4
; RV32I-NEXT: slli a3, a3, 1
-; RV32I-NEXT: not a4, a5
; RV32I-NEXT: sll a3, a3, a4
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: .LBB5_3:
@@ -609,49 +609,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 9(a0)
; RV64I-NEXT: lbu a4, 8(a0)
-; RV64I-NEXT: lbu a5, 10(a0)
-; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: lbu a5, 11(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a6, 14(a0)
-; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 5(a1)
-; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: lbu a6, 6(a1)
-; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: lbu a4, 1(a1)
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 1(a1)
-; RV64I-NEXT: lbu a6, 0(a1)
-; RV64I-NEXT: lbu a7, 2(a1)
-; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: lbu a5, 5(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: slli a7, a7, 51
+; RV64I-NEXT: slli a1, a1, 59
; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a5, a5, 35
; RV64I-NEXT: or a1, a1, a5
-; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: slli a4, a4, 35
-; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: slli a4, a4, 3
+; RV64I-NEXT: or a5, a1, a4
; RV64I-NEXT: addi a4, a5, -64
; RV64I-NEXT: srl a1, a3, a5
; RV64I-NEXT: bltz a4, .LBB6_2
@@ -661,23 +661,23 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: lbu a6, 1(a0)
; RV64I-NEXT: lbu a7, 0(a0)
-; RV64I-NEXT: lbu t0, 2(a0)
-; RV64I-NEXT: lbu t1, 3(a0)
+; RV64I-NEXT: lbu t0, 3(a0)
+; RV64I-NEXT: lbu t1, 2(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 5(a0)
; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu t1, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or a0, t1, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
@@ -828,49 +828,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a7, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 5(a1)
-; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: lbu a6, 6(a1)
-; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: lbu a4, 1(a1)
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 1(a1)
-; RV64I-NEXT: lbu a6, 0(a1)
-; RV64I-NEXT: lbu a7, 2(a1)
-; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: lbu a5, 5(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: slli a7, a7, 51
+; RV64I-NEXT: slli a1, a1, 59
; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a5, a5, 35
; RV64I-NEXT: or a1, a1, a5
-; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: slli a4, a4, 35
-; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: slli a4, a4, 3
+; RV64I-NEXT: or a5, a1, a4
; RV64I-NEXT: addi a4, a5, -64
; RV64I-NEXT: sll a1, a3, a5
; RV64I-NEXT: bltz a4, .LBB7_2
@@ -880,23 +880,23 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: lbu a6, 9(a0)
; RV64I-NEXT: lbu a7, 8(a0)
-; RV64I-NEXT: lbu t0, 10(a0)
-; RV64I-NEXT: lbu t1, 11(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
+; RV64I-NEXT: lbu t1, 10(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 13(a0)
; RV64I-NEXT: lbu t0, 12(a0)
-; RV64I-NEXT: lbu t1, 14(a0)
-; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
+; RV64I-NEXT: lbu a0, 14(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or a0, t1, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
@@ -1047,82 +1047,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 9(a0)
; RV64I-NEXT: lbu a4, 8(a0)
-; RV64I-NEXT: lbu a5, 10(a0)
-; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: lbu a5, 11(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a6, 14(a0)
-; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 1(a1)
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: slli a5, a4, 32
-; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: lbu a5, 5(a1)
; RV64I-NEXT: lbu a6, 4(a1)
; RV64I-NEXT: lbu a7, 6(a1)
-; RV64I-NEXT: lbu t0, 7(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 1(a1)
-; RV64I-NEXT: lbu a7, 0(a1)
-; RV64I-NEXT: lbu t0, 2(a1)
-; RV64I-NEXT: lbu a1, 3(a1)
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, t0
-; RV64I-NEXT: or a1, a1, a6
-; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: slli a7, a7, 51
+; RV64I-NEXT: slli a1, a1, 59
+; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: slli a5, a5, 35
-; RV64I-NEXT: or a5, a5, a1
-; RV64I-NEXT: addi a6, a5, -64
-; RV64I-NEXT: sra a1, a3, a5
-; RV64I-NEXT: bltz a6, .LBB8_2
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a4, a4, 3
+; RV64I-NEXT: or a4, a1, a4
+; RV64I-NEXT: addi a5, a4, -64
+; RV64I-NEXT: sra a1, a3, a4
+; RV64I-NEXT: bltz a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sraiw a3, a4, 31
+; RV64I-NEXT: srai a3, a3, 63
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: mv a1, a3
; RV64I-NEXT: j .LBB8_3
; RV64I-NEXT: .LBB8_2:
-; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 1(a0)
; RV64I-NEXT: lbu a6, 0(a0)
-; RV64I-NEXT: lbu a7, 2(a0)
-; RV64I-NEXT: lbu t0, 3(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: or a4, a4, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu t0, 2(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a6, a7, t0
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 5(a0)
; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: lbu t0, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a0, t0, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: or a0, a0, a4
-; RV64I-NEXT: srl a0, a0, a5
-; RV64I-NEXT: not a4, a5
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: srl a0, a0, a4
+; RV64I-NEXT: not a4, a4
; RV64I-NEXT: slli a3, a3, 1
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: or a0, a0, a3
@@ -1167,7 +1167,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
+; RV32I-NEXT: slli a4, a3, 8
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: lbu a5, 0(a0)
; RV32I-NEXT: lbu a6, 1(a0)
; RV32I-NEXT: lbu a7, 2(a0)
@@ -1272,220 +1273,338 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a5, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: slli a3, a5, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli t0, a7, 8
+; RV64I-NEXT: or t0, t0, a6
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 4(a0)
; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 87(sp)
-; RV64I-NEXT: sb a4, 86(sp)
-; RV64I-NEXT: sb a0, 85(sp)
-; RV64I-NEXT: sb a5, 84(sp)
-; RV64I-NEXT: sb a6, 83(sp)
-; RV64I-NEXT: sb a7, 82(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t0, 81(sp)
-; RV64I-NEXT: sb ra, 80(sp)
-; RV64I-NEXT: sb s11, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 56
-; RV64I-NEXT: add a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu t3, 6(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t1, 9(a0)
+; RV64I-NEXT: lbu t0, 8(a0)
+; RV64I-NEXT: lbu t4, 11(a0)
+; RV64I-NEXT: lbu t3, 10(a0)
+; RV64I-NEXT: slli t2, t1, 8
+; RV64I-NEXT: or t2, t2, t0
+; RV64I-NEXT: slli t5, t4, 8
+; RV64I-NEXT: or t5, t5, t3
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t5, 13(a0)
+; RV64I-NEXT: lbu t6, 12(a0)
+; RV64I-NEXT: lbu s0, 15(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: or s0, s0, s1
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t5, s0, t5
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t6, 17(a0)
+; RV64I-NEXT: lbu t5, 16(a0)
+; RV64I-NEXT: lbu s2, 19(a0)
+; RV64I-NEXT: lbu s1, 18(a0)
+; RV64I-NEXT: slli s0, t6, 8
+; RV64I-NEXT: or s0, s0, t5
+; RV64I-NEXT: slli s3, s2, 8
+; RV64I-NEXT: or s3, s3, s1
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s3, 21(a0)
+; RV64I-NEXT: lbu s4, 20(a0)
+; RV64I-NEXT: lbu s5, 23(a0)
+; RV64I-NEXT: lbu s6, 22(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: or s3, s3, s4
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s4, s5, s6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s3, s4, s3
+; RV64I-NEXT: slli s3, s3, 32
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s4, 25(a0)
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s6, 27(a0)
+; RV64I-NEXT: lbu s5, 26(a0)
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s3
+; RV64I-NEXT: slli s8, s6, 8
+; RV64I-NEXT: or s8, s8, s5
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: lbu s8, 29(a0)
+; RV64I-NEXT: lbu s9, 28(a0)
+; RV64I-NEXT: lbu s10, 31(a0)
+; RV64I-NEXT: lbu a0, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: or a0, s10, a0
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or s7, a0, s7
+; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: sb zero, 71(sp)
+; RV64I-NEXT: sb zero, 70(sp)
+; RV64I-NEXT: sb zero, 69(sp)
+; RV64I-NEXT: sb zero, 68(sp)
+; RV64I-NEXT: sb zero, 67(sp)
+; RV64I-NEXT: sb zero, 66(sp)
+; RV64I-NEXT: sb zero, 65(sp)
+; RV64I-NEXT: sb zero, 64(sp)
+; RV64I-NEXT: sb zero, 63(sp)
+; RV64I-NEXT: sb zero, 62(sp)
+; RV64I-NEXT: sb zero, 61(sp)
+; RV64I-NEXT: sb zero, 60(sp)
+; RV64I-NEXT: sb zero, 59(sp)
+; RV64I-NEXT: sb zero, 58(sp)
+; RV64I-NEXT: sb zero, 57(sp)
+; RV64I-NEXT: sb zero, 56(sp)
+; RV64I-NEXT: sb zero, 55(sp)
+; RV64I-NEXT: sb zero, 54(sp)
+; RV64I-NEXT: sb zero, 53(sp)
+; RV64I-NEXT: sb zero, 52(sp)
+; RV64I-NEXT: sb zero, 51(sp)
+; RV64I-NEXT: sb zero, 50(sp)
+; RV64I-NEXT: sb zero, 49(sp)
+; RV64I-NEXT: sb zero, 48(sp)
+; RV64I-NEXT: sb zero, 47(sp)
+; RV64I-NEXT: sb zero, 46(sp)
+; RV64I-NEXT: sb zero, 45(sp)
+; RV64I-NEXT: sb zero, 44(sp)
+; RV64I-NEXT: sb zero, 43(sp)
+; RV64I-NEXT: sb zero, 42(sp)
+; RV64I-NEXT: sb zero, 41(sp)
+; RV64I-NEXT: sb zero, 40(sp)
+; RV64I-NEXT: sb s6, 35(sp)
+; RV64I-NEXT: sb s5, 34(sp)
+; RV64I-NEXT: sb s4, 33(sp)
+; RV64I-NEXT: sb s3, 32(sp)
+; RV64I-NEXT: sb s2, 27(sp)
+; RV64I-NEXT: sb s1, 26(sp)
+; RV64I-NEXT: sb t6, 25(sp)
+; RV64I-NEXT: sb t5, 24(sp)
+; RV64I-NEXT: sb t4, 19(sp)
+; RV64I-NEXT: sb t3, 18(sp)
+; RV64I-NEXT: sb t1, 17(sp)
+; RV64I-NEXT: sb t0, 16(sp)
+; RV64I-NEXT: sb a7, 11(sp)
+; RV64I-NEXT: sb a6, 10(sp)
+; RV64I-NEXT: sb a5, 9(sp)
+; RV64I-NEXT: sb a4, 8(sp)
+; RV64I-NEXT: srli a1, s7, 56
+; RV64I-NEXT: sb a1, 39(sp)
+; RV64I-NEXT: srli a1, s7, 48
+; RV64I-NEXT: sb a1, 38(sp)
+; RV64I-NEXT: srli a1, s7, 40
+; RV64I-NEXT: sb a1, 37(sp)
+; RV64I-NEXT: srli a1, s7, 32
+; RV64I-NEXT: sb a1, 36(sp)
+; RV64I-NEXT: srli a1, s0, 56
+; RV64I-NEXT: sb a1, 31(sp)
+; RV64I-NEXT: srli a1, s0, 48
+; RV64I-NEXT: sb a1, 30(sp)
+; RV64I-NEXT: srli a1, s0, 40
+; RV64I-NEXT: sb a1, 29(sp)
+; RV64I-NEXT: srli s0, s0, 32
+; RV64I-NEXT: sb s0, 28(sp)
+; RV64I-NEXT: srli a1, t2, 56
+; RV64I-NEXT: sb a1, 23(sp)
+; RV64I-NEXT: srli a1, t2, 48
+; RV64I-NEXT: sb a1, 22(sp)
+; RV64I-NEXT: srli a1, t2, 40
+; RV64I-NEXT: sb a1, 21(sp)
+; RV64I-NEXT: srli a1, t2, 32
+; RV64I-NEXT: sb a1, 20(sp)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 15(sp)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 14(sp)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 13(sp)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 12(sp)
+; RV64I-NEXT: andi a0, a0, 31
+; RV64I-NEXT: addi a6, sp, 8
+; RV64I-NEXT: add a6, a6, a0
+; RV64I-NEXT: lbu a3, 9(a6)
+; RV64I-NEXT: lbu a1, 8(a6)
+; RV64I-NEXT: lbu a5, 11(a6)
+; RV64I-NEXT: lbu a4, 10(a6)
+; RV64I-NEXT: slli a0, a3, 8
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a7, a5, 8
+; RV64I-NEXT: or a7, a7, a4
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: or a0, a7, a0
; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
+; RV64I-NEXT: lbu t0, 12(a6)
; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: lbu t2, 14(a6)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t0, t1, t2
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: lbu t0, 1(a6)
+; RV64I-NEXT: lbu a7, 0(a6)
+; RV64I-NEXT: lbu t3, 3(a6)
+; RV64I-NEXT: lbu t2, 2(a6)
+; RV64I-NEXT: slli t1, t0, 8
+; RV64I-NEXT: or t1, t1, a7
+; RV64I-NEXT: slli t4, t3, 8
+; RV64I-NEXT: or t4, t4, t2
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or t1, t4, t1
+; RV64I-NEXT: lbu t4, 5(a6)
+; RV64I-NEXT: lbu t5, 4(a6)
+; RV64I-NEXT: lbu t6, 7(a6)
+; RV64I-NEXT: lbu s0, 6(a6)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t4, t4, t5
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, s0
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t4, t5, t4
+; RV64I-NEXT: slli t4, t4, 32
+; RV64I-NEXT: or t1, t4, t1
+; RV64I-NEXT: lbu t5, 25(a6)
+; RV64I-NEXT: lbu t4, 24(a6)
+; RV64I-NEXT: lbu s1, 27(a6)
+; RV64I-NEXT: lbu s0, 26(a6)
+; RV64I-NEXT: slli t6, t5, 8
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: slli s2, s1, 8
+; RV64I-NEXT: or s2, s2, s0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or t6, s2, t6
+; RV64I-NEXT: lbu s2, 29(a6)
+; RV64I-NEXT: lbu s3, 28(a6)
+; RV64I-NEXT: lbu s4, 31(a6)
+; RV64I-NEXT: lbu s5, 30(a6)
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or s2, s2, s3
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s3, s4, s5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: slli s2, s2, 32
+; RV64I-NEXT: or t6, s2, t6
+; RV64I-NEXT: lbu s2, 17(a6)
+; RV64I-NEXT: lbu s3, 16(a6)
+; RV64I-NEXT: lbu s4, 19(a6)
+; RV64I-NEXT: lbu s5, 18(a6)
+; RV64I-NEXT: slli s6, s2, 8
+; RV64I-NEXT: or s6, s6, s3
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s5
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or s6, s7, s6
+; RV64I-NEXT: lbu s7, 21(a6)
+; RV64I-NEXT: lbu s8, 20(a6)
+; RV64I-NEXT: lbu s9, 23(a6)
+; RV64I-NEXT: lbu a6, 22(a6)
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or s7, s7, s8
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or a6, s9, a6
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: or a6, a6, s7
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a6, a6, s6
+; RV64I-NEXT: sb s4, 19(a2)
+; RV64I-NEXT: sb s5, 18(a2)
+; RV64I-NEXT: sb s2, 17(a2)
+; RV64I-NEXT: sb s3, 16(a2)
+; RV64I-NEXT: sb s1, 27(a2)
+; RV64I-NEXT: sb s0, 26(a2)
+; RV64I-NEXT: sb t5, 25(a2)
+; RV64I-NEXT: sb t4, 24(a2)
+; RV64I-NEXT: sb t3, 3(a2)
+; RV64I-NEXT: sb t2, 2(a2)
+; RV64I-NEXT: sb t0, 1(a2)
+; RV64I-NEXT: sb a7, 0(a2)
+; RV64I-NEXT: sb a5, 11(a2)
+; RV64I-NEXT: sb a4, 10(a2)
+; RV64I-NEXT: sb a3, 9(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a1, a6, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a6, 48
; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a1, a6, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a6, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, t6, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, t6, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, t6, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, t6, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, t1, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, t1, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, t1, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, t1, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
@@ -1715,220 +1834,338 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a5, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: slli a3, a5, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli t0, a7, 8
+; RV64I-NEXT: or t0, t0, a6
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 4(a0)
; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 119(sp)
-; RV64I-NEXT: sb a4, 118(sp)
-; RV64I-NEXT: sb a0, 117(sp)
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t0, 113(sp)
-; RV64I-NEXT: sb ra, 112(sp)
-; RV64I-NEXT: sb s11, 111(sp)
-; RV64I-NEXT: sb s10, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 88
-; RV64I-NEXT: sub a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu t3, 6(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t1, 9(a0)
+; RV64I-NEXT: lbu t0, 8(a0)
+; RV64I-NEXT: lbu t4, 11(a0)
+; RV64I-NEXT: lbu t3, 10(a0)
+; RV64I-NEXT: slli t2, t1, 8
+; RV64I-NEXT: or t2, t2, t0
+; RV64I-NEXT: slli t5, t4, 8
+; RV64I-NEXT: or t5, t5, t3
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t5, 13(a0)
+; RV64I-NEXT: lbu t6, 12(a0)
+; RV64I-NEXT: lbu s0, 15(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: or s0, s0, s1
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t5, s0, t5
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t6, 17(a0)
+; RV64I-NEXT: lbu t5, 16(a0)
+; RV64I-NEXT: lbu s2, 19(a0)
+; RV64I-NEXT: lbu s1, 18(a0)
+; RV64I-NEXT: slli s0, t6, 8
+; RV64I-NEXT: or s0, s0, t5
+; RV64I-NEXT: slli s3, s2, 8
+; RV64I-NEXT: or s3, s3, s1
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s3, 21(a0)
+; RV64I-NEXT: lbu s4, 20(a0)
+; RV64I-NEXT: lbu s5, 23(a0)
+; RV64I-NEXT: lbu s6, 22(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: or s3, s3, s4
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s4, s5, s6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s3, s4, s3
+; RV64I-NEXT: slli s3, s3, 32
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s4, 25(a0)
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s6, 27(a0)
+; RV64I-NEXT: lbu s5, 26(a0)
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s3
+; RV64I-NEXT: slli s8, s6, 8
+; RV64I-NEXT: or s8, s8, s5
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: lbu s8, 29(a0)
+; RV64I-NEXT: lbu s9, 28(a0)
+; RV64I-NEXT: lbu s10, 31(a0)
+; RV64I-NEXT: lbu a0, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: or a0, s10, a0
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or s7, a0, s7
+; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: sb zero, 39(sp)
+; RV64I-NEXT: sb zero, 38(sp)
+; RV64I-NEXT: sb zero, 37(sp)
+; RV64I-NEXT: sb zero, 36(sp)
+; RV64I-NEXT: sb zero, 35(sp)
+; RV64I-NEXT: sb zero, 34(sp)
+; RV64I-NEXT: sb zero, 33(sp)
+; RV64I-NEXT: sb zero, 32(sp)
+; RV64I-NEXT: sb zero, 31(sp)
+; RV64I-NEXT: sb zero, 30(sp)
+; RV64I-NEXT: sb zero, 29(sp)
+; RV64I-NEXT: sb zero, 28(sp)
+; RV64I-NEXT: sb zero, 27(sp)
+; RV64I-NEXT: sb zero, 26(sp)
+; RV64I-NEXT: sb zero, 25(sp)
+; RV64I-NEXT: sb zero, 24(sp)
+; RV64I-NEXT: sb zero, 23(sp)
+; RV64I-NEXT: sb zero, 22(sp)
+; RV64I-NEXT: sb zero, 21(sp)
+; RV64I-NEXT: sb zero, 20(sp)
+; RV64I-NEXT: sb zero, 19(sp)
+; RV64I-NEXT: sb zero, 18(sp)
+; RV64I-NEXT: sb zero, 17(sp)
+; RV64I-NEXT: sb zero, 16(sp)
+; RV64I-NEXT: sb zero, 15(sp)
+; RV64I-NEXT: sb zero, 14(sp)
+; RV64I-NEXT: sb zero, 13(sp)
+; RV64I-NEXT: sb zero, 12(sp)
+; RV64I-NEXT: sb zero, 11(sp)
+; RV64I-NEXT: sb zero, 10(sp)
+; RV64I-NEXT: sb zero, 9(sp)
+; RV64I-NEXT: sb zero, 8(sp)
+; RV64I-NEXT: sb s6, 67(sp)
+; RV64I-NEXT: sb s5, 66(sp)
+; RV64I-NEXT: sb s4, 65(sp)
+; RV64I-NEXT: sb s3, 64(sp)
+; RV64I-NEXT: sb s2, 59(sp)
+; RV64I-NEXT: sb s1, 58(sp)
+; RV64I-NEXT: sb t6, 57(sp)
+; RV64I-NEXT: sb t5, 56(sp)
+; RV64I-NEXT: sb t4, 51(sp)
+; RV64I-NEXT: sb t3, 50(sp)
+; RV64I-NEXT: sb t1, 49(sp)
+; RV64I-NEXT: sb t0, 48(sp)
+; RV64I-NEXT: sb a7, 43(sp)
+; RV64I-NEXT: sb a6, 42(sp)
+; RV64I-NEXT: sb a5, 41(sp)
+; RV64I-NEXT: sb a4, 40(sp)
+; RV64I-NEXT: srli a1, s7, 56
+; RV64I-NEXT: sb a1, 71(sp)
+; RV64I-NEXT: srli a1, s7, 48
+; RV64I-NEXT: sb a1, 70(sp)
+; RV64I-NEXT: srli a1, s7, 40
+; RV64I-NEXT: sb a1, 69(sp)
+; RV64I-NEXT: srli a1, s7, 32
+; RV64I-NEXT: sb a1, 68(sp)
+; RV64I-NEXT: srli a1, s0, 56
+; RV64I-NEXT: sb a1, 63(sp)
+; RV64I-NEXT: srli a1, s0, 48
+; RV64I-NEXT: sb a1, 62(sp)
+; RV64I-NEXT: srli a1, s0, 40
+; RV64I-NEXT: sb a1, 61(sp)
+; RV64I-NEXT: srli s0, s0, 32
+; RV64I-NEXT: sb s0, 60(sp)
+; RV64I-NEXT: srli a1, t2, 56
+; RV64I-NEXT: sb a1, 55(sp)
+; RV64I-NEXT: srli a1, t2, 48
+; RV64I-NEXT: sb a1, 54(sp)
+; RV64I-NEXT: srli a1, t2, 40
+; RV64I-NEXT: sb a1, 53(sp)
+; RV64I-NEXT: srli a1, t2, 32
+; RV64I-NEXT: sb a1, 52(sp)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 47(sp)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 46(sp)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 45(sp)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 44(sp)
+; RV64I-NEXT: andi a0, a0, 31
+; RV64I-NEXT: addi a1, sp, 40
+; RV64I-NEXT: sub a6, a1, a0
+; RV64I-NEXT: lbu a3, 9(a6)
+; RV64I-NEXT: lbu a1, 8(a6)
+; RV64I-NEXT: lbu a5, 11(a6)
+; RV64I-NEXT: lbu a4, 10(a6)
+; RV64I-NEXT: slli a0, a3, 8
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a7, a5, 8
+; RV64I-NEXT: or a7, a7, a4
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: or a0, a7, a0
; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
+; RV64I-NEXT: lbu t0, 12(a6)
; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: lbu t2, 14(a6)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t0, t1, t2
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: lbu t0, 1(a6)
+; RV64I-NEXT: lbu a7, 0(a6)
+; RV64I-NEXT: lbu t3, 3(a6)
+; RV64I-NEXT: lbu t2, 2(a6)
+; RV64I-NEXT: slli t1, t0, 8
+; RV64I-NEXT: or t1, t1, a7
+; RV64I-NEXT: slli t4, t3, 8
+; RV64I-NEXT: or t4, t4, t2
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or t1, t4, t1
+; RV64I-NEXT: lbu t4, 5(a6)
+; RV64I-NEXT: lbu t5, 4(a6)
+; RV64I-NEXT: lbu t6, 7(a6)
+; RV64I-NEXT: lbu s0, 6(a6)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t4, t4, t5
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, s0
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t4, t5, t4
+; RV64I-NEXT: slli t4, t4, 32
+; RV64I-NEXT: or t1, t4, t1
+; RV64I-NEXT: lbu t5, 25(a6)
+; RV64I-NEXT: lbu t4, 24(a6)
+; RV64I-NEXT: lbu s1, 27(a6)
+; RV64I-NEXT: lbu s0, 26(a6)
+; RV64I-NEXT: slli t6, t5, 8
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: slli s2, s1, 8
+; RV64I-NEXT: or s2, s2, s0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or t6, s2, t6
+; RV64I-NEXT: lbu s2, 29(a6)
+; RV64I-NEXT: lbu s3, 28(a6)
+; RV64I-NEXT: lbu s4, 31(a6)
+; RV64I-NEXT: lbu s5, 30(a6)
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or s2, s2, s3
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s3, s4, s5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: slli s2, s2, 32
+; RV64I-NEXT: or t6, s2, t6
+; RV64I-NEXT: lbu s2, 17(a6)
+; RV64I-NEXT: lbu s3, 16(a6)
+; RV64I-NEXT: lbu s4, 19(a6)
+; RV64I-NEXT: lbu s5, 18(a6)
+; RV64I-NEXT: slli s6, s2, 8
+; RV64I-NEXT: or s6, s6, s3
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s5
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or s6, s7, s6
+; RV64I-NEXT: lbu s7, 21(a6)
+; RV64I-NEXT: lbu s8, 20(a6)
+; RV64I-NEXT: lbu s9, 23(a6)
+; RV64I-NEXT: lbu a6, 22(a6)
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or s7, s7, s8
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or a6, s9, a6
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: or a6, a6, s7
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a6, a6, s6
+; RV64I-NEXT: sb s4, 19(a2)
+; RV64I-NEXT: sb s5, 18(a2)
+; RV64I-NEXT: sb s2, 17(a2)
+; RV64I-NEXT: sb s3, 16(a2)
+; RV64I-NEXT: sb s1, 27(a2)
+; RV64I-NEXT: sb s0, 26(a2)
+; RV64I-NEXT: sb t5, 25(a2)
+; RV64I-NEXT: sb t4, 24(a2)
+; RV64I-NEXT: sb t3, 3(a2)
+; RV64I-NEXT: sb t2, 2(a2)
+; RV64I-NEXT: sb t0, 1(a2)
+; RV64I-NEXT: sb a7, 0(a2)
+; RV64I-NEXT: sb a5, 11(a2)
+; RV64I-NEXT: sb a4, 10(a2)
+; RV64I-NEXT: sb a3, 9(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a1, a6, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a6, 48
; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a1, a6, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a6, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, t6, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, t6, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, t6, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, t6, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, t1, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, t1, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, t1, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, t1, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
@@ -2158,230 +2395,346 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv t0, a1
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a1, 0(a0)
-; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 1(a0)
-; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 2(a0)
-; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 3(a0)
-; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 4(a0)
-; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 5(a0)
-; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t2, 6(a0)
-; RV64I-NEXT: lbu t3, 7(a0)
-; RV64I-NEXT: lbu t4, 8(a0)
-; RV64I-NEXT: lbu t5, 9(a0)
-; RV64I-NEXT: lbu t6, 10(a0)
-; RV64I-NEXT: lbu s0, 11(a0)
-; RV64I-NEXT: lbu s1, 12(a0)
-; RV64I-NEXT: lbu s2, 13(a0)
-; RV64I-NEXT: lbu s3, 14(a0)
-; RV64I-NEXT: lbu s4, 15(a0)
-; RV64I-NEXT: lbu s5, 16(a0)
-; RV64I-NEXT: lbu s6, 17(a0)
-; RV64I-NEXT: lbu s7, 18(a0)
-; RV64I-NEXT: lbu s8, 19(a0)
-; RV64I-NEXT: lbu s9, 20(a0)
-; RV64I-NEXT: lbu s10, 21(a0)
-; RV64I-NEXT: lbu s11, 22(a0)
-; RV64I-NEXT: lbu ra, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
-; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a4, 27(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: lbu t0, 0(t0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a4, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb ra, 79(sp)
-; RV64I-NEXT: sb s11, 78(sp)
-; RV64I-NEXT: sb s10, 77(sp)
-; RV64I-NEXT: sb s9, 76(sp)
-; RV64I-NEXT: sb s8, 75(sp)
-; RV64I-NEXT: sb s7, 74(sp)
-; RV64I-NEXT: sb s6, 73(sp)
-; RV64I-NEXT: sb s5, 72(sp)
-; RV64I-NEXT: sb s4, 71(sp)
-; RV64I-NEXT: sb s3, 70(sp)
-; RV64I-NEXT: sb s2, 69(sp)
-; RV64I-NEXT: sb s1, 68(sp)
-; RV64I-NEXT: sb s0, 67(sp)
-; RV64I-NEXT: sb t6, 66(sp)
-; RV64I-NEXT: sb t5, 65(sp)
-; RV64I-NEXT: sb t4, 64(sp)
-; RV64I-NEXT: sb t3, 63(sp)
-; RV64I-NEXT: sb t2, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: andi a0, t0, 31
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a6, a1, a0
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a5, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: slli a3, a5, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli t0, a7, 8
+; RV64I-NEXT: or t0, t0, a6
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 4(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 6(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t2, 9(a0)
+; RV64I-NEXT: lbu t1, 8(a0)
+; RV64I-NEXT: lbu t4, 11(a0)
+; RV64I-NEXT: lbu t3, 10(a0)
+; RV64I-NEXT: slli t0, t2, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t5, t4, 8
+; RV64I-NEXT: or t5, t5, t3
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t0, t5, t0
+; RV64I-NEXT: lbu t5, 13(a0)
+; RV64I-NEXT: lbu t6, 12(a0)
+; RV64I-NEXT: lbu s0, 15(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: or s0, s0, s1
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t5, s0, t5
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: or t0, t5, t0
+; RV64I-NEXT: lbu s0, 17(a0)
+; RV64I-NEXT: lbu t6, 16(a0)
+; RV64I-NEXT: lbu s2, 19(a0)
+; RV64I-NEXT: lbu s1, 18(a0)
+; RV64I-NEXT: slli t5, s0, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s3, s2, 8
+; RV64I-NEXT: or s3, s3, s1
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, t5
+; RV64I-NEXT: lbu s3, 21(a0)
+; RV64I-NEXT: lbu s4, 20(a0)
+; RV64I-NEXT: lbu s5, 23(a0)
+; RV64I-NEXT: lbu s6, 22(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: or s3, s3, s4
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s4, s5, s6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s3, s4, s3
+; RV64I-NEXT: slli s3, s3, 32
+; RV64I-NEXT: or t5, s3, t5
+; RV64I-NEXT: lbu s3, 25(a0)
+; RV64I-NEXT: lbu s4, 24(a0)
+; RV64I-NEXT: lbu s5, 27(a0)
+; RV64I-NEXT: lbu s6, 26(a0)
+; RV64I-NEXT: slli s7, s3, 8
+; RV64I-NEXT: or s7, s7, s4
+; RV64I-NEXT: slli s8, s5, 8
+; RV64I-NEXT: or s8, s8, s6
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: lbu s8, 29(a0)
+; RV64I-NEXT: lbu s9, 28(a0)
+; RV64I-NEXT: lbu s10, 31(a0)
+; RV64I-NEXT: lbu a0, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: or a0, s10, a0
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or s7, a0, s7
+; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: sb s5, 35(sp)
+; RV64I-NEXT: sb s6, 34(sp)
+; RV64I-NEXT: sb s3, 33(sp)
+; RV64I-NEXT: sb s4, 32(sp)
+; RV64I-NEXT: sb s2, 27(sp)
+; RV64I-NEXT: sb s1, 26(sp)
+; RV64I-NEXT: sb s0, 25(sp)
+; RV64I-NEXT: sb t6, 24(sp)
+; RV64I-NEXT: sb t4, 19(sp)
+; RV64I-NEXT: sb t3, 18(sp)
+; RV64I-NEXT: sb t2, 17(sp)
+; RV64I-NEXT: sb t1, 16(sp)
+; RV64I-NEXT: sb a7, 11(sp)
+; RV64I-NEXT: sb a6, 10(sp)
+; RV64I-NEXT: sb a5, 9(sp)
+; RV64I-NEXT: sb a4, 8(sp)
+; RV64I-NEXT: srai a1, s7, 63
+; RV64I-NEXT: sb a1, 64(sp)
+; RV64I-NEXT: sb a1, 56(sp)
+; RV64I-NEXT: sb a1, 48(sp)
+; RV64I-NEXT: sb a1, 40(sp)
+; RV64I-NEXT: srli a4, s7, 56
+; RV64I-NEXT: sb a4, 39(sp)
+; RV64I-NEXT: srli a4, s7, 48
+; RV64I-NEXT: sb a4, 38(sp)
+; RV64I-NEXT: srli a4, s7, 40
+; RV64I-NEXT: sb a4, 37(sp)
+; RV64I-NEXT: srli a4, s7, 32
+; RV64I-NEXT: sb a4, 36(sp)
+; RV64I-NEXT: srli a4, t5, 56
+; RV64I-NEXT: sb a4, 31(sp)
+; RV64I-NEXT: srli a4, t5, 48
+; RV64I-NEXT: sb a4, 30(sp)
+; RV64I-NEXT: srli a4, t5, 40
+; RV64I-NEXT: sb a4, 29(sp)
+; RV64I-NEXT: srli a4, t5, 32
+; RV64I-NEXT: sb a4, 28(sp)
+; RV64I-NEXT: srli a4, t0, 56
+; RV64I-NEXT: sb a4, 23(sp)
+; RV64I-NEXT: srli a4, t0, 48
+; RV64I-NEXT: sb a4, 22(sp)
+; RV64I-NEXT: srli a4, t0, 40
+; RV64I-NEXT: sb a4, 21(sp)
+; RV64I-NEXT: srli a4, t0, 32
+; RV64I-NEXT: sb a4, 20(sp)
+; RV64I-NEXT: srli a4, a3, 56
+; RV64I-NEXT: sb a4, 15(sp)
+; RV64I-NEXT: srli a4, a3, 48
+; RV64I-NEXT: sb a4, 14(sp)
+; RV64I-NEXT: srli a4, a3, 40
+; RV64I-NEXT: sb a4, 13(sp)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 12(sp)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 71(sp)
+; RV64I-NEXT: srli a4, a1, 48
+; RV64I-NEXT: sb a4, 70(sp)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 69(sp)
+; RV64I-NEXT: srli a6, a1, 32
+; RV64I-NEXT: sb a6, 68(sp)
+; RV64I-NEXT: srli a7, a1, 24
+; RV64I-NEXT: sb a7, 67(sp)
+; RV64I-NEXT: srli t0, a1, 16
+; RV64I-NEXT: sb t0, 66(sp)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 65(sp)
+; RV64I-NEXT: sb a3, 63(sp)
+; RV64I-NEXT: sb a4, 62(sp)
+; RV64I-NEXT: sb a5, 61(sp)
+; RV64I-NEXT: sb a6, 60(sp)
+; RV64I-NEXT: sb a7, 59(sp)
+; RV64I-NEXT: sb t0, 58(sp)
+; RV64I-NEXT: sb a1, 57(sp)
+; RV64I-NEXT: sb a3, 55(sp)
+; RV64I-NEXT: sb a4, 54(sp)
+; RV64I-NEXT: sb a5, 53(sp)
+; RV64I-NEXT: sb a6, 52(sp)
+; RV64I-NEXT: sb a7, 51(sp)
+; RV64I-NEXT: sb t0, 50(sp)
+; RV64I-NEXT: sb a1, 49(sp)
+; RV64I-NEXT: sb a3, 47(sp)
+; RV64I-NEXT: sb a4, 46(sp)
+; RV64I-NEXT: sb a5, 45(sp)
+; RV64I-NEXT: sb a6, 44(sp)
+; RV64I-NEXT: sb a7, 43(sp)
+; RV64I-NEXT: sb t0, 42(sp)
+; RV64I-NEXT: sb a1, 41(sp)
+; RV64I-NEXT: andi a0, a0, 31
+; RV64I-NEXT: addi a6, sp, 8
+; RV64I-NEXT: add a6, a6, a0
+; RV64I-NEXT: lbu a3, 9(a6)
+; RV64I-NEXT: lbu a1, 8(a6)
+; RV64I-NEXT: lbu a5, 11(a6)
+; RV64I-NEXT: lbu a4, 10(a6)
+; RV64I-NEXT: slli a0, a3, 8
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a7, a5, 8
+; RV64I-NEXT: or a7, a7, a4
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: or a0, a7, a0
; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
+; RV64I-NEXT: lbu t0, 12(a6)
; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: lbu t2, 14(a6)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t0, t1, t2
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: lbu t0, 1(a6)
+; RV64I-NEXT: lbu a7, 0(a6)
+; RV64I-NEXT: lbu t3, 3(a6)
+; RV64I-NEXT: lbu t2, 2(a6)
+; RV64I-NEXT: slli t1, t0, 8
+; RV64I-NEXT: or t1, t1, a7
+; RV64I-NEXT: slli t4, t3, 8
+; RV64I-NEXT: or t4, t4, t2
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or t1, t4, t1
+; RV64I-NEXT: lbu t4, 5(a6)
+; RV64I-NEXT: lbu t5, 4(a6)
+; RV64I-NEXT: lbu t6, 7(a6)
+; RV64I-NEXT: lbu s0, 6(a6)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t4, t4, t5
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, s0
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t4, t5, t4
+; RV64I-NEXT: slli t4, t4, 32
+; RV64I-NEXT: or t1, t4, t1
+; RV64I-NEXT: lbu t5, 25(a6)
+; RV64I-NEXT: lbu t4, 24(a6)
+; RV64I-NEXT: lbu s1, 27(a6)
+; RV64I-NEXT: lbu s0, 26(a6)
+; RV64I-NEXT: slli t6, t5, 8
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: slli s2, s1, 8
+; RV64I-NEXT: or s2, s2, s0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or t6, s2, t6
+; RV64I-NEXT: lbu s2, 29(a6)
+; RV64I-NEXT: lbu s3, 28(a6)
+; RV64I-NEXT: lbu s4, 31(a6)
+; RV64I-NEXT: lbu s5, 30(a6)
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or s2, s2, s3
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s3, s4, s5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: slli s2, s2, 32
+; RV64I-NEXT: or t6, s2, t6
+; RV64I-NEXT: lbu s2, 17(a6)
+; RV64I-NEXT: lbu s3, 16(a6)
+; RV64I-NEXT: lbu s4, 19(a6)
+; RV64I-NEXT: lbu s5, 18(a6)
+; RV64I-NEXT: slli s6, s2, 8
+; RV64I-NEXT: or s6, s6, s3
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s5
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or s6, s7, s6
+; RV64I-NEXT: lbu s7, 21(a6)
+; RV64I-NEXT: lbu s8, 20(a6)
+; RV64I-NEXT: lbu s9, 23(a6)
+; RV64I-NEXT: lbu a6, 22(a6)
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or s7, s7, s8
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or a6, s9, a6
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: or a6, a6, s7
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a6, a6, s6
+; RV64I-NEXT: sb s4, 19(a2)
+; RV64I-NEXT: sb s5, 18(a2)
+; RV64I-NEXT: sb s2, 17(a2)
+; RV64I-NEXT: sb s3, 16(a2)
+; RV64I-NEXT: sb s1, 27(a2)
+; RV64I-NEXT: sb s0, 26(a2)
+; RV64I-NEXT: sb t5, 25(a2)
+; RV64I-NEXT: sb t4, 24(a2)
+; RV64I-NEXT: sb t3, 3(a2)
+; RV64I-NEXT: sb t2, 2(a2)
+; RV64I-NEXT: sb t0, 1(a2)
+; RV64I-NEXT: sb a7, 0(a2)
+; RV64I-NEXT: sb a5, 11(a2)
+; RV64I-NEXT: sb a4, 10(a2)
+; RV64I-NEXT: sb a3, 9(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a1, a6, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a6, 48
; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a1, a6, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a6, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, t6, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, t6, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, t6, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, t6, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, t1, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, t1, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, t1, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, t1, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
@@ -2447,7 +2800,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb a5, 54(sp)
; RV32I-NEXT: sb a6, 53(sp)
; RV32I-NEXT: sb t1, 59(sp)
-; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: sb a7, 52(sp)
; RV32I-NEXT: sb ra, 51(sp)
; RV32I-NEXT: sb s11, 50(sp)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afaa..1976f4bef8cdd6 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -7,46 +7,46 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lb a5, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: lb a0, 3(a0)
+; RV64I-NEXT: lbu a0, 2(a0)
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
-; RV64I-NEXT: srli a1, a0, 16
-; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: srli a1, a0, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a0, a0, 16
+; RV64I-NEXT: sb a0, 2(a2)
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_4bytes:
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a0, a5, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu a5, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a1, a5, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: srl a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
@@ -68,46 +68,46 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lb a5, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: lb a0, 3(a0)
+; RV64I-NEXT: lbu a0, 2(a0)
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
-; RV64I-NEXT: srli a1, a0, 16
-; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: srli a1, a0, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a0, a0, 16
+; RV64I-NEXT: sb a0, 2(a2)
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_4bytes:
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a0, a5, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu a5, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a1, a5, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: sll a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
@@ -129,46 +129,46 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lb a5, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: lb a0, 3(a0)
+; RV64I-NEXT: lbu a0, 2(a0)
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: sraw a0, a0, a1
; RV64I-NEXT: sb a0, 0(a2)
-; RV64I-NEXT: srli a1, a0, 16
-; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: srli a1, a0, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a0, a0, 16
+; RV64I-NEXT: sb a0, 2(a2)
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_4bytes:
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a0, a5, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: lbu a3, 1(a1)
; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu a5, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a1, a5, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: sra a0, a0, a1
; RV32I-NEXT: sb a0, 0(a2)
@@ -191,45 +191,45 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: lbu a3, 1(a1)
; RV64I-NEXT: lbu a4, 0(a1)
-; RV64I-NEXT: lbu a5, 2(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a5, 3(a1)
+; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a1)
; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: lbu a6, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: lbu a6, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a3
@@ -255,24 +255,24 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 5(a0)
; RV32I-NEXT: lbu a4, 4(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 7(a0)
+; RV32I-NEXT: lbu a5, 7(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
-; RV32I-NEXT: lbu a6, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu a6, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a5, a1, a6
-; RV32I-NEXT: or a5, a5, a4
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a5, a1, a4
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: srl a1, a3, a5
; RV32I-NEXT: bltz a4, .LBB3_2
@@ -282,13 +282,13 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: .LBB3_2:
; RV32I-NEXT: lbu a6, 1(a0)
; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: lbu t0, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a0, t0, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: srl a0, a0, a5
; RV32I-NEXT: not a5, a5
@@ -324,45 +324,45 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: lbu a3, 1(a1)
; RV64I-NEXT: lbu a4, 0(a1)
-; RV64I-NEXT: lbu a5, 2(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a5, 3(a1)
+; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a1)
; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: lbu a6, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: lbu a6, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a3
@@ -388,24 +388,24 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 1(a0)
; RV32I-NEXT: lbu a4, 0(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a5, 3(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: lbu a4, 1(a1)
; RV32I-NEXT: lbu a5, 0(a1)
-; RV32I-NEXT: lbu a6, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu a6, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a5, a1, a6
-; RV32I-NEXT: or a5, a5, a4
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a5, a1, a4
; RV32I-NEXT: addi a4, a5, -32
; RV32I-NEXT: sll a1, a3, a5
; RV32I-NEXT: bltz a4, .LBB4_2
@@ -415,13 +415,13 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: .LBB4_2:
; RV32I-NEXT: lbu a6, 5(a0)
; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 6(a0)
-; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
+; RV32I-NEXT: lbu a0, 6(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a0, t0, a0
+; RV32I-NEXT: slli a0, a0, 16
; RV32I-NEXT: or a0, a0, a6
; RV32I-NEXT: sll a0, a0, a5
; RV32I-NEXT: not a5, a5
@@ -457,45 +457,45 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: lbu a3, 1(a1)
; RV64I-NEXT: lbu a4, 0(a1)
-; RV64I-NEXT: lbu a5, 2(a1)
-; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a5, 3(a1)
+; RV64I-NEXT: lbu a6, 2(a1)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a1)
; RV64I-NEXT: lbu a5, 4(a1)
-; RV64I-NEXT: lbu a6, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: lbu a6, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: or a1, a1, a4
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a3
@@ -521,45 +521,45 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lbu a3, 5(a0)
; RV32I-NEXT: lbu a4, 4(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 7(a0)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a4, a6, 24
-; RV32I-NEXT: or a5, a4, a5
+; RV32I-NEXT: lbu a4, 7(a0)
+; RV32I-NEXT: lbu a5, 6(a0)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: lbu a7, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
+; RV32I-NEXT: slli a5, a4, 16
; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: lbu a5, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
-; RV32I-NEXT: lbu a7, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, a7
-; RV32I-NEXT: or a5, a1, a5
-; RV32I-NEXT: addi a6, a5, -32
-; RV32I-NEXT: sra a1, a3, a5
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a4, a1, a6
+; RV32I-NEXT: addi a6, a4, -32
+; RV32I-NEXT: sra a1, a3, a4
; RV32I-NEXT: bltz a6, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a4, a4, 31
+; RV32I-NEXT: srai a5, a5, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a4
+; RV32I-NEXT: mv a1, a5
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
; RV32I-NEXT: lbu a6, 0(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu a0, 3(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: or a0, a0, a7
-; RV32I-NEXT: or a0, a0, a4
-; RV32I-NEXT: srl a0, a0, a5
-; RV32I-NEXT: not a4, a5
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a0, a7, a0
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: srl a0, a0, a4
+; RV32I-NEXT: not a4, a4
; RV32I-NEXT: slli a3, a3, 1
; RV32I-NEXT: sll a3, a3, a4
; RV32I-NEXT: or a0, a0, a3
@@ -591,45 +591,45 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 9(a0)
; RV64I-NEXT: lbu a4, 8(a0)
-; RV64I-NEXT: lbu a5, 10(a0)
-; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: lbu a5, 11(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a6, 14(a0)
-; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a6, 2(a1)
-; RV64I-NEXT: lbu a7, 3(a1)
+; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: lbu a5, 5(a1)
; RV64I-NEXT: lbu a6, 4(a1)
-; RV64I-NEXT: lbu a7, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a1, a7, a1
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a5, a1, a4
@@ -642,23 +642,23 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: lbu a6, 1(a0)
; RV64I-NEXT: lbu a7, 0(a0)
-; RV64I-NEXT: lbu t0, 2(a0)
-; RV64I-NEXT: lbu t1, 3(a0)
+; RV64I-NEXT: lbu t0, 3(a0)
+; RV64I-NEXT: lbu t1, 2(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 5(a0)
; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu t1, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or a0, t1, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
@@ -728,13 +728,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s3, 13(a0)
; RV32I-NEXT: slli s0, s0, 8
; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu s1, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: lbu s4, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a1, s1, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, s0
; RV32I-NEXT: sb zero, 43(sp)
; RV32I-NEXT: sb zero, 42(sp)
@@ -774,25 +774,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: add a3, a3, a0
; RV32I-NEXT: lbu a0, 5(a3)
; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
+; RV32I-NEXT: lbu a5, 7(a3)
+; RV32I-NEXT: lbu a6, 6(a3)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a5, a4, a0
; RV32I-NEXT: andi a4, a1, 7
; RV32I-NEXT: srl a0, a5, a4
; RV32I-NEXT: lbu a1, 9(a3)
; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
+; RV32I-NEXT: lbu a7, 11(a3)
+; RV32I-NEXT: lbu t0, 10(a3)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, t0
+; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: or a6, a6, a1
; RV32I-NEXT: slli a1, a6, 1
; RV32I-NEXT: not a7, a4
@@ -800,13 +800,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: lbu a7, 1(a3)
; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
+; RV32I-NEXT: lbu t1, 3(a3)
+; RV32I-NEXT: lbu t2, 2(a3)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t2
+; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: or a7, t0, a7
; RV32I-NEXT: srl a7, a7, a4
; RV32I-NEXT: slli a5, a5, 1
@@ -816,13 +816,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a6, a6, a4
; RV32I-NEXT: lbu t1, 13(a3)
; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
+; RV32I-NEXT: lbu t3, 15(a3)
+; RV32I-NEXT: lbu a3, 14(a3)
; RV32I-NEXT: slli t1, t1, 8
; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a3, t3, a3
+; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: or a3, a3, t1
; RV32I-NEXT: slli t1, a3, 1
; RV32I-NEXT: sll t0, t1, t0
@@ -874,45 +874,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 1(a0)
; RV64I-NEXT: lbu a4, 0(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a5, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 5(a0)
; RV64I-NEXT: lbu a5, 4(a0)
-; RV64I-NEXT: lbu a6, 6(a0)
-; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: lbu a6, 7(a0)
+; RV64I-NEXT: lbu a7, 6(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 1(a1)
; RV64I-NEXT: lbu a5, 0(a1)
-; RV64I-NEXT: lbu a6, 2(a1)
-; RV64I-NEXT: lbu a7, 3(a1)
+; RV64I-NEXT: lbu a6, 3(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: lbu a5, 5(a1)
; RV64I-NEXT: lbu a6, 4(a1)
-; RV64I-NEXT: lbu a7, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a1, a7, a1
+; RV64I-NEXT: slli a1, a1, 16
; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a5, a1, a4
@@ -925,23 +925,23 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: lbu a6, 9(a0)
; RV64I-NEXT: lbu a7, 8(a0)
-; RV64I-NEXT: lbu t0, 10(a0)
-; RV64I-NEXT: lbu t1, 11(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
+; RV64I-NEXT: lbu t1, 10(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: lbu a7, 13(a0)
; RV64I-NEXT: lbu t0, 12(a0)
-; RV64I-NEXT: lbu t1, 14(a0)
-; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
+; RV64I-NEXT: lbu a0, 14(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or a0, t1, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a7
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a6
@@ -1011,13 +1011,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s3, 13(a0)
; RV32I-NEXT: slli s0, s0, 8
; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu s1, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: lbu s4, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a1, s1, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, s0
; RV32I-NEXT: sb zero, 27(sp)
; RV32I-NEXT: sb zero, 26(sp)
@@ -1057,25 +1057,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sub a3, a3, a0
; RV32I-NEXT: lbu a0, 5(a3)
; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
+; RV32I-NEXT: lbu a5, 7(a3)
+; RV32I-NEXT: lbu a6, 6(a3)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a5, a4, a0
; RV32I-NEXT: andi a4, a1, 7
; RV32I-NEXT: sll a0, a5, a4
; RV32I-NEXT: lbu a1, 1(a3)
; RV32I-NEXT: lbu a6, 0(a3)
-; RV32I-NEXT: lbu a7, 2(a3)
-; RV32I-NEXT: lbu t0, 3(a3)
+; RV32I-NEXT: lbu a7, 3(a3)
+; RV32I-NEXT: lbu t0, 2(a3)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, t0
+; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: or a6, a6, a1
; RV32I-NEXT: srli a1, a6, 1
; RV32I-NEXT: xori a7, a4, 31
@@ -1083,24 +1083,24 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: lbu t0, 13(a3)
; RV32I-NEXT: lbu t1, 12(a3)
-; RV32I-NEXT: lbu t2, 14(a3)
-; RV32I-NEXT: lbu t3, 15(a3)
+; RV32I-NEXT: lbu t2, 15(a3)
+; RV32I-NEXT: lbu t3, 14(a3)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t3
+; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: or t0, t1, t0
; RV32I-NEXT: sll t0, t0, a4
; RV32I-NEXT: lbu t1, 9(a3)
; RV32I-NEXT: lbu t2, 8(a3)
-; RV32I-NEXT: lbu t3, 10(a3)
-; RV32I-NEXT: lbu a3, 11(a3)
+; RV32I-NEXT: lbu t3, 11(a3)
+; RV32I-NEXT: lbu a3, 10(a3)
; RV32I-NEXT: slli t1, t1, 8
; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a3, t3, a3
+; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: or a3, a3, t1
; RV32I-NEXT: srli t1, a3, 1
; RV32I-NEXT: srl a7, t1, a7
@@ -1157,81 +1157,81 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lbu a3, 9(a0)
; RV64I-NEXT: lbu a4, 8(a0)
-; RV64I-NEXT: lbu a5, 10(a0)
-; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: lbu a5, 11(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: lbu a4, 13(a0)
; RV64I-NEXT: lbu a5, 12(a0)
-; RV64I-NEXT: lbu a6, 14(a0)
-; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: slli a5, a4, 32
-; RV64I-NEXT: or a3, a5, a3
-; RV64I-NEXT: lbu a5, 1(a1)
-; RV64I-NEXT: lbu a6, 0(a1)
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 1(a1)
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 3(a1)
; RV64I-NEXT: lbu a7, 2(a1)
-; RV64I-NEXT: lbu t0, 3(a1)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 5(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 5(a1)
-; RV64I-NEXT: lbu a7, 4(a1)
-; RV64I-NEXT: lbu t0, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, t0
-; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a1, a7, a1
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, a5
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a5, a1, a5
-; RV64I-NEXT: addi a6, a5, -64
-; RV64I-NEXT: sra a1, a3, a5
-; RV64I-NEXT: bltz a6, .LBB8_2
+; RV64I-NEXT: or a4, a1, a4
+; RV64I-NEXT: addi a5, a4, -64
+; RV64I-NEXT: sra a1, a3, a4
+; RV64I-NEXT: bltz a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sraiw a3, a4, 31
+; RV64I-NEXT: srai a3, a3, 63
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: mv a1, a3
; RV64I-NEXT: j .LBB8_3
; RV64I-NEXT: .LBB8_2:
-; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 1(a0)
; RV64I-NEXT: lbu a6, 0(a0)
-; RV64I-NEXT: lbu a7, 2(a0)
-; RV64I-NEXT: lbu t0, 3(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: or a4, a4, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu t0, 2(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a6, a7, t0
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 5(a0)
; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: lbu t0, 6(a0)
-; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: lbu a0, 6(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a0, t0, a0
+; RV64I-NEXT: slli a0, a0, 16
; RV64I-NEXT: or a0, a0, a6
; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: or a0, a0, a4
-; RV64I-NEXT: srl a0, a0, a5
-; RV64I-NEXT: not a4, a5
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: srl a0, a0, a4
+; RV64I-NEXT: not a4, a4
; RV64I-NEXT: slli a3, a3, 1
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: or a0, a0, a3
@@ -1278,7 +1278,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
+; RV32I-NEXT: slli a4, a3, 8
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: lbu a5, 0(a0)
; RV32I-NEXT: lbu a6, 1(a0)
; RV32I-NEXT: lbu a7, 2(a0)
@@ -1296,13 +1297,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s4, 12(a0)
; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: or s1, s1, s2
-; RV32I-NEXT: lbu s2, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu s2, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: lbu s5, 13(a0)
; RV32I-NEXT: lbu a0, 14(a0)
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: or a1, s2, a1
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a1, a1, s1
; RV32I-NEXT: sb a3, 23(sp)
; RV32I-NEXT: sb a0, 22(sp)
@@ -1346,25 +1347,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: add a3, a3, a0
; RV32I-NEXT: lbu a0, 5(a3)
; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
+; RV32I-NEXT: lbu a5, 7(a3)
+; RV32I-NEXT: lbu a6, 6(a3)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a4, a5, a6
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a5, a4, a0
; RV32I-NEXT: andi a4, a1, 7
; RV32I-NEXT: srl a0, a5, a4
; RV32I-NEXT: lbu a1, 9(a3)
; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
+; RV32I-NEXT: lbu a7, 11(a3)
+; RV32I-NEXT: lbu t0, 10(a3)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, t0
+; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: or a6, a6, a1
; RV32I-NEXT: slli a1, a6, 1
; RV32I-NEXT: not a7, a4
@@ -1372,14 +1373,14 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: lbu a7, 1(a3)
; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
+; RV32I-NEXT: lbu t1, 3(a3)
+; RV32I-NEXT: lbu t2, 2(a3)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t2
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: or a7, t0, a7
; RV32I-NEXT: srl a7, a7, a4
; RV32I-NEXT: slli a5, a5, 1
; RV32I-NEXT: xori t0, a4, 31
@@ -1388,13 +1389,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a6, a6, a4
; RV32I-NEXT: lbu t1, 13(a3)
; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
+; RV32I-NEXT: lbu t3, 15(a3)
+; RV32I-NEXT: lbu a3, 14(a3)
; RV32I-NEXT: slli t1, t1, 8
; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a3, t3, a3
+; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: or a3, a3, t1
; RV32I-NEXT: slli t1, a3, 1
; RV32I-NEXT: sll t0, t1, t0
@@ -1446,332 +1447,388 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a5, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: slli a3, a5, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli t0, a7, 8
+; RV64I-NEXT: or t0, t0, a6
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 4(a0)
; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
+; RV64I-NEXT: lbu t3, 6(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t1, 9(a0)
+; RV64I-NEXT: lbu t0, 8(a0)
+; RV64I-NEXT: lbu t4, 11(a0)
+; RV64I-NEXT: lbu t3, 10(a0)
+; RV64I-NEXT: slli t2, t1, 8
+; RV64I-NEXT: or t2, t2, t0
+; RV64I-NEXT: slli t5, t4, 8
+; RV64I-NEXT: or t5, t5, t3
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t5, 13(a0)
+; RV64I-NEXT: lbu t6, 12(a0)
+; RV64I-NEXT: lbu s0, 15(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: or s0, s0, s1
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t5, s0, t5
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t6, 17(a0)
+; RV64I-NEXT: lbu t5, 16(a0)
+; RV64I-NEXT: lbu s2, 19(a0)
+; RV64I-NEXT: lbu s1, 18(a0)
+; RV64I-NEXT: slli s0, t6, 8
+; RV64I-NEXT: or s0, s0, t5
+; RV64I-NEXT: slli s3, s2, 8
+; RV64I-NEXT: or s3, s3, s1
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s3, 21(a0)
+; RV64I-NEXT: lbu s4, 20(a0)
+; RV64I-NEXT: lbu s5, 23(a0)
+; RV64I-NEXT: lbu s6, 22(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: or s3, s3, s4
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s4, s5, s6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s3, s4, s3
+; RV64I-NEXT: slli s3, s3, 32
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s4, 25(a0)
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s6, 27(a0)
+; RV64I-NEXT: lbu s5, 26(a0)
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s3
+; RV64I-NEXT: slli s8, s6, 8
+; RV64I-NEXT: or s8, s8, s5
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: lbu s8, 29(a0)
+; RV64I-NEXT: lbu s9, 28(a0)
+; RV64I-NEXT: lbu s10, 31(a0)
+; RV64I-NEXT: lbu a0, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: or a0, s10, a0
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or s7, a0, s7
+; RV64I-NEXT: lbu a0, 1(a1)
+; RV64I-NEXT: lbu s8, 0(a1)
+; RV64I-NEXT: lbu s9, 3(a1)
+; RV64I-NEXT: lbu s10, 2(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, s8
; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
+; RV64I-NEXT: or s8, s9, s10
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or a0, s8, a0
+; RV64I-NEXT: lbu s8, 5(a1)
; RV64I-NEXT: lbu s9, 4(a1)
+; RV64I-NEXT: lbu s10, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
+; RV64I-NEXT: or a1, s10, a1
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, s8
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 87(sp)
-; RV64I-NEXT: sb a3, 86(sp)
-; RV64I-NEXT: sb a4, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a5, 83(sp)
-; RV64I-NEXT: sb a6, 82(sp)
-; RV64I-NEXT: sb a7, 81(sp)
-; RV64I-NEXT: sb s11, 80(sp)
-; RV64I-NEXT: sb s10, 79(sp)
-; RV64I-NEXT: sb ra, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a3, sp, 56
-; RV64I-NEXT: add a3, a3, a0
-; RV64I-NEXT: lbu a0, 9(a3)
-; RV64I-NEXT: lbu a1, 8(a3)
-; RV64I-NEXT: lbu a4, 10(a3)
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: sb zero, 71(sp)
+; RV64I-NEXT: sb zero, 70(sp)
+; RV64I-NEXT: sb zero, 69(sp)
+; RV64I-NEXT: sb zero, 68(sp)
+; RV64I-NEXT: sb zero, 67(sp)
+; RV64I-NEXT: sb zero, 66(sp)
+; RV64I-NEXT: sb zero, 65(sp)
+; RV64I-NEXT: sb zero, 64(sp)
+; RV64I-NEXT: sb zero, 63(sp)
+; RV64I-NEXT: sb zero, 62(sp)
+; RV64I-NEXT: sb zero, 61(sp)
+; RV64I-NEXT: sb zero, 60(sp)
+; RV64I-NEXT: sb zero, 59(sp)
+; RV64I-NEXT: sb zero, 58(sp)
+; RV64I-NEXT: sb zero, 57(sp)
+; RV64I-NEXT: sb zero, 56(sp)
+; RV64I-NEXT: sb zero, 55(sp)
+; RV64I-NEXT: sb zero, 54(sp)
+; RV64I-NEXT: sb zero, 53(sp)
+; RV64I-NEXT: sb zero, 52(sp)
+; RV64I-NEXT: sb zero, 51(sp)
+; RV64I-NEXT: sb zero, 50(sp)
+; RV64I-NEXT: sb zero, 49(sp)
+; RV64I-NEXT: sb zero, 48(sp)
+; RV64I-NEXT: sb zero, 47(sp)
+; RV64I-NEXT: sb zero, 46(sp)
+; RV64I-NEXT: sb zero, 45(sp)
+; RV64I-NEXT: sb zero, 44(sp)
+; RV64I-NEXT: sb zero, 43(sp)
+; RV64I-NEXT: sb zero, 42(sp)
+; RV64I-NEXT: sb zero, 41(sp)
+; RV64I-NEXT: sb zero, 40(sp)
+; RV64I-NEXT: sb s6, 35(sp)
+; RV64I-NEXT: sb s5, 34(sp)
+; RV64I-NEXT: sb s4, 33(sp)
+; RV64I-NEXT: sb s3, 32(sp)
+; RV64I-NEXT: sb s2, 27(sp)
+; RV64I-NEXT: sb s1, 26(sp)
+; RV64I-NEXT: sb t6, 25(sp)
+; RV64I-NEXT: sb t5, 24(sp)
+; RV64I-NEXT: sb t4, 19(sp)
+; RV64I-NEXT: sb t3, 18(sp)
+; RV64I-NEXT: sb t1, 17(sp)
+; RV64I-NEXT: sb t0, 16(sp)
+; RV64I-NEXT: sb a7, 11(sp)
+; RV64I-NEXT: sb a6, 10(sp)
+; RV64I-NEXT: sb a5, 9(sp)
+; RV64I-NEXT: sb a4, 8(sp)
+; RV64I-NEXT: srli a1, s7, 56
+; RV64I-NEXT: sb a1, 39(sp)
+; RV64I-NEXT: srli a1, s7, 48
+; RV64I-NEXT: sb a1, 38(sp)
+; RV64I-NEXT: srli a1, s7, 40
+; RV64I-NEXT: sb a1, 37(sp)
+; RV64I-NEXT: srli a1, s7, 32
+; RV64I-NEXT: sb a1, 36(sp)
+; RV64I-NEXT: srli a1, s0, 56
+; RV64I-NEXT: sb a1, 31(sp)
+; RV64I-NEXT: srli a1, s0, 48
+; RV64I-NEXT: sb a1, 30(sp)
+; RV64I-NEXT: srli a1, s0, 40
+; RV64I-NEXT: sb a1, 29(sp)
+; RV64I-NEXT: srli s0, s0, 32
+; RV64I-NEXT: sb s0, 28(sp)
+; RV64I-NEXT: srli a1, t2, 56
+; RV64I-NEXT: sb a1, 23(sp)
+; RV64I-NEXT: srli a1, t2, 48
+; RV64I-NEXT: sb a1, 22(sp)
+; RV64I-NEXT: srli a1, t2, 40
+; RV64I-NEXT: sb a1, 21(sp)
+; RV64I-NEXT: srli a1, t2, 32
+; RV64I-NEXT: sb a1, 20(sp)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 15(sp)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 14(sp)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 13(sp)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 12(sp)
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 59
+; RV64I-NEXT: addi a3, sp, 8
+; RV64I-NEXT: add a3, a3, a1
+; RV64I-NEXT: lbu a1, 9(a3)
+; RV64I-NEXT: lbu a4, 8(a3)
; RV64I-NEXT: lbu a5, 11(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a1, 13(a3)
-; RV64I-NEXT: lbu a4, 12(a3)
-; RV64I-NEXT: lbu a5, 14(a3)
-; RV64I-NEXT: lbu a6, 15(a3)
+; RV64I-NEXT: lbu a6, 10(a3)
; RV64I-NEXT: slli a1, a1, 8
; RV64I-NEXT: or a1, a1, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
+; RV64I-NEXT: slli a4, a4, 16
; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a4, a1, a0
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a0, 17(a3)
-; RV64I-NEXT: lbu a5, 16(a3)
-; RV64I-NEXT: lbu a6, 18(a3)
+; RV64I-NEXT: lbu a4, 13(a3)
+; RV64I-NEXT: lbu a5, 12(a3)
+; RV64I-NEXT: lbu a6, 15(a3)
+; RV64I-NEXT: lbu a7, 14(a3)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: andi a4, a0, 7
+; RV64I-NEXT: srl a0, a5, a4
+; RV64I-NEXT: lbu a1, 17(a3)
+; RV64I-NEXT: lbu a6, 16(a3)
; RV64I-NEXT: lbu a7, 19(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: lbu t0, 18(a3)
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a6, a7, t0
; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a3)
-; RV64I-NEXT: lbu a6, 20(a3)
-; RV64I-NEXT: lbu a7, 22(a3)
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: lbu a6, 21(a3)
+; RV64I-NEXT: lbu a7, 20(a3)
; RV64I-NEXT: lbu t0, 23(a3)
-; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: lbu t1, 22(a3)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a1
-; RV64I-NEXT: sll a0, a0, a6
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a7, a6, a1
+; RV64I-NEXT: slli a1, a7, 1
+; RV64I-NEXT: not a6, a4
+; RV64I-NEXT: sll a1, a1, a6
+; RV64I-NEXT: or a1, a0, a1
; RV64I-NEXT: lbu a6, 1(a3)
-; RV64I-NEXT: lbu a7, 0(a3)
-; RV64I-NEXT: lbu t0, 2(a3)
+; RV64I-NEXT: lbu t0, 0(a3)
; RV64I-NEXT: lbu t1, 3(a3)
+; RV64I-NEXT: lbu t2, 2(a3)
; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: or a6, a6, t0
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t0, t1, t2
; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
-; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a3)
-; RV64I-NEXT: lbu t0, 4(a3)
-; RV64I-NEXT: lbu t1, 6(a3)
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: lbu t0, 5(a3)
+; RV64I-NEXT: lbu t1, 4(a3)
; RV64I-NEXT: lbu t2, 7(a3)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a3)
-; RV64I-NEXT: lbu t0, 24(a3)
-; RV64I-NEXT: lbu t1, 26(a3)
-; RV64I-NEXT: lbu t2, 27(a3)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a3)
-; RV64I-NEXT: lbu t1, 28(a3)
-; RV64I-NEXT: lbu t2, 30(a3)
-; RV64I-NEXT: lbu a3, 31(a3)
+; RV64I-NEXT: lbu t3, 6(a3)
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: srl a6, a6, a4
+; RV64I-NEXT: slli a5, a5, 1
+; RV64I-NEXT: xori t0, a4, 63
+; RV64I-NEXT: sll a5, a5, t0
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: srl a7, a7, a4
+; RV64I-NEXT: lbu t1, 25(a3)
+; RV64I-NEXT: lbu t2, 24(a3)
+; RV64I-NEXT: lbu t3, 27(a3)
+; RV64I-NEXT: lbu t4, 26(a3)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t1, t1, t2
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t4
; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: lbu t2, 29(a3)
+; RV64I-NEXT: lbu t3, 28(a3)
+; RV64I-NEXT: lbu t4, 31(a3)
+; RV64I-NEXT: lbu a3, 30(a3)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli a3, a3, 16
; RV64I-NEXT: or a3, a3, t2
-; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: or a3, a3, t0
-; RV64I-NEXT: xori t0, a1, 63
-; RV64I-NEXT: sll t1, t1, t0
; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a7, a3, a7
-; RV64I-NEXT: slli a3, a7, 1
-; RV64I-NEXT: sll t0, a3, t0
-; RV64I-NEXT: srl a3, a4, a1
-; RV64I-NEXT: srl a4, a6, a1
-; RV64I-NEXT: srl a5, a5, a1
-; RV64I-NEXT: srl a1, a7, a1
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
-; RV64I-NEXT: sb a5, 16(a2)
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 26(a2)
-; RV64I-NEXT: sb a1, 24(a2)
-; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 25(a2)
-; RV64I-NEXT: srli a1, a4, 48
-; RV64I-NEXT: sb a1, 6(a2)
-; RV64I-NEXT: srli a1, a4, 40
-; RV64I-NEXT: sb a1, 5(a2)
-; RV64I-NEXT: srli a1, a4, 32
-; RV64I-NEXT: sb a1, 4(a2)
-; RV64I-NEXT: srli a1, a4, 24
-; RV64I-NEXT: sb a1, 3(a2)
-; RV64I-NEXT: srli a1, a4, 16
-; RV64I-NEXT: sb a1, 2(a2)
-; RV64I-NEXT: or a1, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
-; RV64I-NEXT: srli a4, a4, 8
-; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: or a3, a3, t1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: or t0, a7, t0
+; RV64I-NEXT: srl a3, a3, a4
+; RV64I-NEXT: sb a7, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a6, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a4, a7, 48
+; RV64I-NEXT: sb a4, 22(a2)
+; RV64I-NEXT: srli a4, a7, 40
+; RV64I-NEXT: sb a4, 21(a2)
+; RV64I-NEXT: srli a4, a7, 32
+; RV64I-NEXT: sb a4, 20(a2)
+; RV64I-NEXT: srli a4, a7, 24
+; RV64I-NEXT: sb a4, 19(a2)
+; RV64I-NEXT: srli a4, a7, 16
+; RV64I-NEXT: sb a4, 18(a2)
+; RV64I-NEXT: srli a4, a7, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a4, a3, 56
+; RV64I-NEXT: sb a4, 31(a2)
; RV64I-NEXT: srli a4, a3, 48
-; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a4, 30(a2)
; RV64I-NEXT: srli a4, a3, 40
-; RV64I-NEXT: sb a4, 13(a2)
+; RV64I-NEXT: sb a4, 29(a2)
; RV64I-NEXT: srli a4, a3, 32
-; RV64I-NEXT: sb a4, 12(a2)
+; RV64I-NEXT: sb a4, 28(a2)
; RV64I-NEXT: srli a4, a3, 24
-; RV64I-NEXT: sb a4, 11(a2)
+; RV64I-NEXT: sb a4, 27(a2)
; RV64I-NEXT: srli a4, a3, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
-; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 26(a2)
; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: srli a3, a6, 56
-; RV64I-NEXT: sb a3, 23(a2)
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a3, a6, 48
+; RV64I-NEXT: sb a3, 6(a2)
+; RV64I-NEXT: srli a3, a6, 40
+; RV64I-NEXT: sb a3, 5(a2)
+; RV64I-NEXT: srli a3, a6, 32
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: srli a3, a6, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, a6, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a3, a6, 8
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: srli a3, a0, 48
+; RV64I-NEXT: sb a3, 14(a2)
+; RV64I-NEXT: srli a3, a0, 40
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: srli a3, a0, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a0, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: srli a0, t0, 56
+; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: srli a5, a5, 56
+; RV64I-NEXT: sb a5, 7(a2)
; RV64I-NEXT: srli a1, a1, 56
-; RV64I-NEXT: sb a1, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
@@ -1821,14 +1878,14 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s9, 21(a0)
; RV32I-NEXT: lbu s11, 0(a1)
; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu ra, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: or s10, s10, s11
; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: slli ra, ra, 8
+; RV32I-NEXT: or a1, ra, a1
; RV32I-NEXT: lbu ra, 23(a0)
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or t0, a1, s10
; RV32I-NEXT: lbu s10, 24(a0)
; RV32I-NEXT: lbu a7, 25(a0)
@@ -1914,82 +1971,82 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: add a4, a4, a0
; RV32I-NEXT: lbu a0, 5(a4)
; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
+; RV32I-NEXT: lbu a3, 7(a4)
+; RV32I-NEXT: lbu a5, 6(a4)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: or t5, a3, a0
; RV32I-NEXT: andi a3, t0, 7
; RV32I-NEXT: lbu a0, 9(a4)
; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
+; RV32I-NEXT: lbu a5, 11(a4)
+; RV32I-NEXT: lbu a6, 10(a4)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a1, a5, a6
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a6, a1, a0
; RV32I-NEXT: slli a0, a6, 1
; RV32I-NEXT: not t1, a3
; RV32I-NEXT: sll a0, a0, t1
; RV32I-NEXT: lbu a1, 1(a4)
; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
+; RV32I-NEXT: lbu a7, 3(a4)
+; RV32I-NEXT: lbu t0, 2(a4)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a5, a7, t0
+; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: or t0, a5, a1
; RV32I-NEXT: slli a1, t5, 1
; RV32I-NEXT: xori t2, a3, 31
; RV32I-NEXT: sll a1, a1, t2
; RV32I-NEXT: lbu a5, 13(a4)
; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
+; RV32I-NEXT: lbu t3, 15(a4)
+; RV32I-NEXT: lbu t4, 14(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a7, t3, t4
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: or t3, a7, a5
; RV32I-NEXT: lbu a5, 17(a4)
; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
+; RV32I-NEXT: lbu t4, 19(a4)
+; RV32I-NEXT: lbu t6, 18(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: or a7, t4, t6
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: or t4, a7, a5
; RV32I-NEXT: slli a5, t4, 1
; RV32I-NEXT: sll a7, a5, t1
; RV32I-NEXT: lbu a5, 21(a4)
; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
+; RV32I-NEXT: lbu s0, 23(a4)
+; RV32I-NEXT: lbu s1, 22(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, t6
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: or s0, s0, s1
; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
; RV32I-NEXT: or s0, s0, a5
; RV32I-NEXT: lbu a5, 25(a4)
; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
+; RV32I-NEXT: lbu s1, 27(a4)
+; RV32I-NEXT: lbu s2, 26(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or t6, s1, s2
+; RV32I-NEXT: slli t6, t6, 16
; RV32I-NEXT: or t6, t6, a5
; RV32I-NEXT: lbu a5, 29(a4)
; RV32I-NEXT: lbu s1, 28(a4)
@@ -1997,15 +2054,15 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sll t1, s2, t1
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
+; RV32I-NEXT: lbu s1, 31(a4)
+; RV32I-NEXT: lbu a4, 30(a4)
; RV32I-NEXT: slli s2, t3, 1
; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a4, s1, a4
; RV32I-NEXT: slli s1, s0, 1
; RV32I-NEXT: sll s1, s1, t2
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or s3, a4, a5
; RV32I-NEXT: slli a4, s3, 1
; RV32I-NEXT: sll t2, a4, t2
@@ -2104,332 +2161,388 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a5, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: slli a3, a5, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli t0, a7, 8
+; RV64I-NEXT: or t0, t0, a6
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 4(a0)
; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
+; RV64I-NEXT: lbu t3, 6(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t1, 9(a0)
+; RV64I-NEXT: lbu t0, 8(a0)
+; RV64I-NEXT: lbu t4, 11(a0)
+; RV64I-NEXT: lbu t3, 10(a0)
+; RV64I-NEXT: slli t2, t1, 8
+; RV64I-NEXT: or t2, t2, t0
+; RV64I-NEXT: slli t5, t4, 8
+; RV64I-NEXT: or t5, t5, t3
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t5, 13(a0)
+; RV64I-NEXT: lbu t6, 12(a0)
+; RV64I-NEXT: lbu s0, 15(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: or s0, s0, s1
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t5, s0, t5
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: or t2, t5, t2
+; RV64I-NEXT: lbu t6, 17(a0)
+; RV64I-NEXT: lbu t5, 16(a0)
+; RV64I-NEXT: lbu s2, 19(a0)
+; RV64I-NEXT: lbu s1, 18(a0)
+; RV64I-NEXT: slli s0, t6, 8
+; RV64I-NEXT: or s0, s0, t5
+; RV64I-NEXT: slli s3, s2, 8
+; RV64I-NEXT: or s3, s3, s1
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s3, 21(a0)
+; RV64I-NEXT: lbu s4, 20(a0)
+; RV64I-NEXT: lbu s5, 23(a0)
+; RV64I-NEXT: lbu s6, 22(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: or s3, s3, s4
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s4, s5, s6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s3, s4, s3
+; RV64I-NEXT: slli s3, s3, 32
+; RV64I-NEXT: or s0, s3, s0
+; RV64I-NEXT: lbu s4, 25(a0)
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s6, 27(a0)
+; RV64I-NEXT: lbu s5, 26(a0)
+; RV64I-NEXT: slli s7, s4, 8
+; RV64I-NEXT: or s7, s7, s3
+; RV64I-NEXT: slli s8, s6, 8
+; RV64I-NEXT: or s8, s8, s5
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: lbu s8, 29(a0)
+; RV64I-NEXT: lbu s9, 28(a0)
+; RV64I-NEXT: lbu s10, 31(a0)
+; RV64I-NEXT: lbu a0, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: or a0, s10, a0
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or s7, a0, s7
+; RV64I-NEXT: lbu a0, 1(a1)
+; RV64I-NEXT: lbu s8, 0(a1)
+; RV64I-NEXT: lbu s9, 3(a1)
+; RV64I-NEXT: lbu s10, 2(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, s8
; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
+; RV64I-NEXT: or s8, s9, s10
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or a0, s8, a0
+; RV64I-NEXT: lbu s8, 5(a1)
; RV64I-NEXT: lbu s9, 4(a1)
+; RV64I-NEXT: lbu s10, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
+; RV64I-NEXT: or a1, s10, a1
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, s8
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: sb a0, 116(sp)
-; RV64I-NEXT: sb a5, 115(sp)
-; RV64I-NEXT: sb a6, 114(sp)
-; RV64I-NEXT: sb a7, 113(sp)
-; RV64I-NEXT: sb s11, 112(sp)
-; RV64I-NEXT: sb s10, 111(sp)
-; RV64I-NEXT: sb ra, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 88
-; RV64I-NEXT: sub a0, a1, a0
-; RV64I-NEXT: lbu a1, 9(a0)
-; RV64I-NEXT: lbu a3, 8(a0)
-; RV64I-NEXT: lbu a4, 10(a0)
-; RV64I-NEXT: lbu a5, 11(a0)
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: sb zero, 39(sp)
+; RV64I-NEXT: sb zero, 38(sp)
+; RV64I-NEXT: sb zero, 37(sp)
+; RV64I-NEXT: sb zero, 36(sp)
+; RV64I-NEXT: sb zero, 35(sp)
+; RV64I-NEXT: sb zero, 34(sp)
+; RV64I-NEXT: sb zero, 33(sp)
+; RV64I-NEXT: sb zero, 32(sp)
+; RV64I-NEXT: sb zero, 31(sp)
+; RV64I-NEXT: sb zero, 30(sp)
+; RV64I-NEXT: sb zero, 29(sp)
+; RV64I-NEXT: sb zero, 28(sp)
+; RV64I-NEXT: sb zero, 27(sp)
+; RV64I-NEXT: sb zero, 26(sp)
+; RV64I-NEXT: sb zero, 25(sp)
+; RV64I-NEXT: sb zero, 24(sp)
+; RV64I-NEXT: sb zero, 23(sp)
+; RV64I-NEXT: sb zero, 22(sp)
+; RV64I-NEXT: sb zero, 21(sp)
+; RV64I-NEXT: sb zero, 20(sp)
+; RV64I-NEXT: sb zero, 19(sp)
+; RV64I-NEXT: sb zero, 18(sp)
+; RV64I-NEXT: sb zero, 17(sp)
+; RV64I-NEXT: sb zero, 16(sp)
+; RV64I-NEXT: sb zero, 15(sp)
+; RV64I-NEXT: sb zero, 14(sp)
+; RV64I-NEXT: sb zero, 13(sp)
+; RV64I-NEXT: sb zero, 12(sp)
+; RV64I-NEXT: sb zero, 11(sp)
+; RV64I-NEXT: sb zero, 10(sp)
+; RV64I-NEXT: sb zero, 9(sp)
+; RV64I-NEXT: sb zero, 8(sp)
+; RV64I-NEXT: sb s6, 67(sp)
+; RV64I-NEXT: sb s5, 66(sp)
+; RV64I-NEXT: sb s4, 65(sp)
+; RV64I-NEXT: sb s3, 64(sp)
+; RV64I-NEXT: sb s2, 59(sp)
+; RV64I-NEXT: sb s1, 58(sp)
+; RV64I-NEXT: sb t6, 57(sp)
+; RV64I-NEXT: sb t5, 56(sp)
+; RV64I-NEXT: sb t4, 51(sp)
+; RV64I-NEXT: sb t3, 50(sp)
+; RV64I-NEXT: sb t1, 49(sp)
+; RV64I-NEXT: sb t0, 48(sp)
+; RV64I-NEXT: sb a7, 43(sp)
+; RV64I-NEXT: sb a6, 42(sp)
+; RV64I-NEXT: sb a5, 41(sp)
+; RV64I-NEXT: sb a4, 40(sp)
+; RV64I-NEXT: srli a1, s7, 56
+; RV64I-NEXT: sb a1, 71(sp)
+; RV64I-NEXT: srli a1, s7, 48
+; RV64I-NEXT: sb a1, 70(sp)
+; RV64I-NEXT: srli a1, s7, 40
+; RV64I-NEXT: sb a1, 69(sp)
+; RV64I-NEXT: srli a1, s7, 32
+; RV64I-NEXT: sb a1, 68(sp)
+; RV64I-NEXT: srli a1, s0, 56
+; RV64I-NEXT: sb a1, 63(sp)
+; RV64I-NEXT: srli a1, s0, 48
+; RV64I-NEXT: sb a1, 62(sp)
+; RV64I-NEXT: srli a1, s0, 40
+; RV64I-NEXT: sb a1, 61(sp)
+; RV64I-NEXT: srli s0, s0, 32
+; RV64I-NEXT: sb s0, 60(sp)
+; RV64I-NEXT: srli a1, t2, 56
+; RV64I-NEXT: sb a1, 55(sp)
+; RV64I-NEXT: srli a1, t2, 48
+; RV64I-NEXT: sb a1, 54(sp)
+; RV64I-NEXT: srli a1, t2, 40
+; RV64I-NEXT: sb a1, 53(sp)
+; RV64I-NEXT: srli a1, t2, 32
+; RV64I-NEXT: sb a1, 52(sp)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 47(sp)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 46(sp)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 45(sp)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 44(sp)
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 59
+; RV64I-NEXT: addi a3, sp, 40
+; RV64I-NEXT: sub a3, a3, a1
+; RV64I-NEXT: lbu a1, 9(a3)
+; RV64I-NEXT: lbu a4, 8(a3)
+; RV64I-NEXT: lbu a5, 11(a3)
+; RV64I-NEXT: lbu a6, 10(a3)
; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: lbu a3, 13(a0)
-; RV64I-NEXT: lbu a4, 12(a0)
-; RV64I-NEXT: lbu a5, 14(a0)
-; RV64I-NEXT: lbu a6, 15(a0)
-; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: or a3, a3, a4
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a3, a3, a1
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a6, 2(a0)
-; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a4, 13(a3)
+; RV64I-NEXT: lbu a5, 12(a3)
+; RV64I-NEXT: lbu a6, 15(a3)
+; RV64I-NEXT: lbu a7, 14(a3)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
-; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 5(a0)
-; RV64I-NEXT: lbu a6, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t0, 7(a0)
-; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
+; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 25(a0)
-; RV64I-NEXT: lbu a6, 24(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu t0, 27(a0)
-; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 29(a0)
-; RV64I-NEXT: lbu a7, 28(a0)
-; RV64I-NEXT: lbu t0, 30(a0)
-; RV64I-NEXT: lbu t1, 31(a0)
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: andi a4, a0, 7
+; RV64I-NEXT: sll a0, a5, a4
+; RV64I-NEXT: lbu a1, 1(a3)
+; RV64I-NEXT: lbu a6, 0(a3)
+; RV64I-NEXT: lbu a7, 3(a3)
+; RV64I-NEXT: lbu t0, 2(a3)
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a6, a7, t0
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: lbu a6, 5(a3)
+; RV64I-NEXT: lbu a7, 4(a3)
+; RV64I-NEXT: lbu t0, 7(a3)
+; RV64I-NEXT: lbu t1, 6(a3)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu a7, 16(a0)
-; RV64I-NEXT: lbu t0, 18(a0)
-; RV64I-NEXT: lbu t1, 19(a0)
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: lbu a7, 21(a0)
-; RV64I-NEXT: or t0, t1, t0
-; RV64I-NEXT: or a6, t0, a6
-; RV64I-NEXT: lbu t0, 20(a0)
+; RV64I-NEXT: or a6, a6, a1
+; RV64I-NEXT: srli a1, a6, 1
+; RV64I-NEXT: xori t0, a4, 63
+; RV64I-NEXT: srl a1, a1, t0
+; RV64I-NEXT: or a1, a0, a1
+; RV64I-NEXT: lbu a7, 25(a3)
+; RV64I-NEXT: lbu t1, 24(a3)
+; RV64I-NEXT: lbu t2, 27(a3)
+; RV64I-NEXT: lbu t3, 26(a3)
; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t1, 22(a0)
-; RV64I-NEXT: lbu a0, 23(a0)
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: srli t0, a4, 1
+; RV64I-NEXT: or a7, a7, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or t1, a0, t1
-; RV64I-NEXT: xori t2, a1, 63
-; RV64I-NEXT: srl a0, t0, t2
; RV64I-NEXT: or a7, t1, a7
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: srli a7, a6, 1
-; RV64I-NEXT: srl a7, a7, t2
-; RV64I-NEXT: srli t0, a3, 1
-; RV64I-NEXT: not t1, a1
-; RV64I-NEXT: srl t0, t0, t1
-; RV64I-NEXT: sll a3, a3, a1
-; RV64I-NEXT: sll a5, a5, a1
-; RV64I-NEXT: sll a6, a6, a1
-; RV64I-NEXT: sll a1, a4, a1
-; RV64I-NEXT: srli a4, a6, 56
-; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a6, 48
-; RV64I-NEXT: sb a4, 22(a2)
-; RV64I-NEXT: srli a4, a6, 40
-; RV64I-NEXT: sb a4, 21(a2)
-; RV64I-NEXT: srli a4, a6, 32
-; RV64I-NEXT: sb a4, 20(a2)
-; RV64I-NEXT: srli a4, a6, 24
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: srli a4, a6, 16
-; RV64I-NEXT: sb a4, 18(a2)
-; RV64I-NEXT: or a4, a6, t0
-; RV64I-NEXT: srli a6, a6, 8
+; RV64I-NEXT: lbu t1, 29(a3)
+; RV64I-NEXT: lbu t2, 28(a3)
+; RV64I-NEXT: lbu t3, 31(a3)
+; RV64I-NEXT: lbu t4, 30(a3)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t1, t1, t2
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t4
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: sll a7, a7, a4
+; RV64I-NEXT: lbu t1, 17(a3)
+; RV64I-NEXT: lbu t2, 16(a3)
+; RV64I-NEXT: lbu t3, 19(a3)
+; RV64I-NEXT: lbu t4, 18(a3)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t1, t1, t2
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t4
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: lbu t2, 21(a3)
+; RV64I-NEXT: lbu t3, 20(a3)
+; RV64I-NEXT: lbu t4, 23(a3)
+; RV64I-NEXT: lbu a3, 22(a3)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli a3, a3, 16
+; RV64I-NEXT: or a3, a3, t2
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: or t1, a3, t1
+; RV64I-NEXT: srli a3, t1, 1
+; RV64I-NEXT: srl a3, a3, t0
+; RV64I-NEXT: or a3, a7, a3
+; RV64I-NEXT: sll t0, t1, a4
+; RV64I-NEXT: srli a5, a5, 1
+; RV64I-NEXT: not t1, a4
+; RV64I-NEXT: srl a5, a5, t1
+; RV64I-NEXT: or a5, t0, a5
+; RV64I-NEXT: sll a4, a6, a4
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: srli a6, t0, 56
+; RV64I-NEXT: sb a6, 23(a2)
+; RV64I-NEXT: srli a6, t0, 48
+; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, t0, 40
+; RV64I-NEXT: sb a6, 21(a2)
+; RV64I-NEXT: srli a6, t0, 32
+; RV64I-NEXT: sb a6, 20(a2)
+; RV64I-NEXT: srli a6, t0, 24
+; RV64I-NEXT: sb a6, 19(a2)
+; RV64I-NEXT: srli a6, t0, 16
+; RV64I-NEXT: sb a6, 18(a2)
+; RV64I-NEXT: srli a6, t0, 8
; RV64I-NEXT: sb a6, 17(a2)
-; RV64I-NEXT: srli a6, a5, 56
+; RV64I-NEXT: srli a6, a7, 56
; RV64I-NEXT: sb a6, 31(a2)
-; RV64I-NEXT: srli a6, a5, 48
+; RV64I-NEXT: srli a6, a7, 48
; RV64I-NEXT: sb a6, 30(a2)
-; RV64I-NEXT: srli a6, a5, 40
+; RV64I-NEXT: srli a6, a7, 40
; RV64I-NEXT: sb a6, 29(a2)
-; RV64I-NEXT: srli a6, a5, 32
+; RV64I-NEXT: srli a6, a7, 32
; RV64I-NEXT: sb a6, 28(a2)
-; RV64I-NEXT: srli a6, a5, 24
+; RV64I-NEXT: srli a6, a7, 24
; RV64I-NEXT: sb a6, 27(a2)
-; RV64I-NEXT: srli a6, a5, 16
+; RV64I-NEXT: srli a6, a7, 16
; RV64I-NEXT: sb a6, 26(a2)
-; RV64I-NEXT: or a6, a5, a7
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 25(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 7(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 6(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 5(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 4(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 3(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 2(a2)
-; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 1(a2)
-; RV64I-NEXT: srli a1, a3, 56
-; RV64I-NEXT: sb a1, 15(a2)
-; RV64I-NEXT: srli a1, a3, 48
-; RV64I-NEXT: sb a1, 14(a2)
-; RV64I-NEXT: srli a1, a3, 40
-; RV64I-NEXT: sb a1, 13(a2)
-; RV64I-NEXT: srli a1, a3, 32
-; RV64I-NEXT: sb a1, 12(a2)
-; RV64I-NEXT: srli a1, a3, 24
-; RV64I-NEXT: sb a1, 11(a2)
-; RV64I-NEXT: srli a1, a3, 16
-; RV64I-NEXT: sb a1, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: sb a6, 24(a2)
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a6, a7, 8
+; RV64I-NEXT: sb a6, 25(a2)
+; RV64I-NEXT: srli a6, a4, 56
+; RV64I-NEXT: sb a6, 7(a2)
+; RV64I-NEXT: srli a6, a4, 48
+; RV64I-NEXT: sb a6, 6(a2)
+; RV64I-NEXT: srli a6, a4, 40
+; RV64I-NEXT: sb a6, 5(a2)
+; RV64I-NEXT: srli a6, a4, 32
+; RV64I-NEXT: sb a6, 4(a2)
+; RV64I-NEXT: srli a6, a4, 24
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a6, a4, 16
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a4, a0, 56
+; RV64I-NEXT: sb a4, 15(a2)
+; RV64I-NEXT: srli a4, a0, 48
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: sb a4, 13(a2)
+; RV64I-NEXT: srli a4, a0, 32
+; RV64I-NEXT: sb a4, 12(a2)
+; RV64I-NEXT: srli a4, a0, 24
+; RV64I-NEXT: sb a4, 11(a2)
+; RV64I-NEXT: srli a4, a0, 16
+; RV64I-NEXT: sb a4, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
@@ -2479,14 +2592,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s9, 21(a0)
; RV32I-NEXT: lbu s11, 0(a1)
; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu ra, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: or s10, s10, s11
; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: slli ra, ra, 8
+; RV32I-NEXT: or a1, ra, a1
; RV32I-NEXT: lbu ra, 23(a0)
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or t0, a1, s10
; RV32I-NEXT: lbu s10, 24(a0)
; RV32I-NEXT: lbu a7, 25(a0)
@@ -2572,105 +2685,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sub a4, a4, a0
; RV32I-NEXT: lbu a0, 5(a4)
; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
+; RV32I-NEXT: lbu a3, 7(a4)
+; RV32I-NEXT: lbu a5, 6(a4)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: or t5, a3, a0
; RV32I-NEXT: andi a1, t0, 7
; RV32I-NEXT: lbu a0, 1(a4)
; RV32I-NEXT: lbu a3, 0(a4)
-; RV32I-NEXT: lbu a5, 2(a4)
-; RV32I-NEXT: lbu a6, 3(a4)
+; RV32I-NEXT: lbu a5, 3(a4)
+; RV32I-NEXT: lbu a6, 2(a4)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a3
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a3, a5, a6
+; RV32I-NEXT: slli a3, a3, 16
; RV32I-NEXT: or a6, a3, a0
; RV32I-NEXT: srli a0, a6, 1
-; RV32I-NEXT: xori a7, a1, 31
-; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: xori t0, a1, 31
+; RV32I-NEXT: srl a0, a0, t0
; RV32I-NEXT: lbu a3, 13(a4)
; RV32I-NEXT: lbu a5, 12(a4)
-; RV32I-NEXT: lbu t0, 14(a4)
-; RV32I-NEXT: lbu t1, 15(a4)
+; RV32I-NEXT: lbu a7, 15(a4)
+; RV32I-NEXT: lbu t1, 14(a4)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a5
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a5, t1, t0
-; RV32I-NEXT: or t0, a5, a3
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a5, a7, t1
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or t1, a5, a3
; RV32I-NEXT: lbu a3, 9(a4)
; RV32I-NEXT: lbu a5, 8(a4)
-; RV32I-NEXT: lbu t1, 10(a4)
-; RV32I-NEXT: lbu t2, 11(a4)
+; RV32I-NEXT: lbu a7, 11(a4)
+; RV32I-NEXT: lbu t2, 10(a4)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, a5
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a5, t2, t1
-; RV32I-NEXT: or t1, a5, a3
-; RV32I-NEXT: srli a3, t1, 1
-; RV32I-NEXT: srl a5, a3, a7
-; RV32I-NEXT: srli t4, t5, 1
-; RV32I-NEXT: not t2, a1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a5, a7, t2
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or t2, a5, a3
+; RV32I-NEXT: srli a3, t2, 1
+; RV32I-NEXT: srl a5, a3, t0
+; RV32I-NEXT: srli a3, t5, 1
+; RV32I-NEXT: not t3, a1
+; RV32I-NEXT: srl a7, a3, t3
; RV32I-NEXT: lbu a3, 21(a4)
-; RV32I-NEXT: lbu t3, 20(a4)
-; RV32I-NEXT: lbu t6, 22(a4)
-; RV32I-NEXT: lbu s0, 23(a4)
+; RV32I-NEXT: lbu t4, 20(a4)
+; RV32I-NEXT: lbu t6, 23(a4)
+; RV32I-NEXT: lbu s0, 22(a4)
; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t3, s0, t6
-; RV32I-NEXT: or t3, t3, a3
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t4, t6, s0
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: or t4, t4, a3
; RV32I-NEXT: lbu a3, 17(a4)
; RV32I-NEXT: lbu t6, 16(a4)
-; RV32I-NEXT: lbu s0, 18(a4)
-; RV32I-NEXT: lbu s1, 19(a4)
+; RV32I-NEXT: lbu s0, 19(a4)
+; RV32I-NEXT: lbu s1, 18(a4)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, t6
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: or s0, s0, s1
; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
; RV32I-NEXT: or s0, s0, a3
; RV32I-NEXT: lbu a3, 29(a4)
; RV32I-NEXT: lbu t6, 28(a4)
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu s2, 31(a4)
+; RV32I-NEXT: lbu s1, 31(a4)
+; RV32I-NEXT: lbu s2, 30(a4)
; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or t6, s1, s2
; RV32I-NEXT: lbu s1, 25(a4)
; RV32I-NEXT: lbu s2, 24(a4)
-; RV32I-NEXT: srl t4, t4, t2
+; RV32I-NEXT: slli t6, t6, 16
; RV32I-NEXT: or t6, t6, a3
; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: or a3, s1, s2
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu a4, 27(a4)
+; RV32I-NEXT: lbu s1, 27(a4)
+; RV32I-NEXT: lbu a4, 26(a4)
; RV32I-NEXT: srli s2, s0, 1
-; RV32I-NEXT: srl s2, s2, a7
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: srli s1, t0, 1
-; RV32I-NEXT: srl s1, s1, t2
+; RV32I-NEXT: srl s2, s2, t0
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a4, s1, a4
+; RV32I-NEXT: srli s1, t1, 1
+; RV32I-NEXT: srl s1, s1, t3
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or a4, a4, a3
; RV32I-NEXT: srli a3, a4, 1
-; RV32I-NEXT: srl a7, a3, a7
-; RV32I-NEXT: srli a3, t3, 1
-; RV32I-NEXT: srl t2, a3, t2
+; RV32I-NEXT: srl t0, a3, t0
+; RV32I-NEXT: srli a3, t4, 1
+; RV32I-NEXT: srl t3, a3, t3
; RV32I-NEXT: sll a3, t5, a1
-; RV32I-NEXT: sll t0, t0, a1
; RV32I-NEXT: sll t1, t1, a1
-; RV32I-NEXT: sll t3, t3, a1
+; RV32I-NEXT: sll t2, t2, a1
+; RV32I-NEXT: sll t4, t4, a1
; RV32I-NEXT: sll t5, s0, a1
; RV32I-NEXT: sll t6, t6, a1
; RV32I-NEXT: sll a4, a4, a1
@@ -2679,48 +2792,48 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb a6, 27(a2)
; RV32I-NEXT: srli a6, a4, 16
; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: or a6, a4, t2
+; RV32I-NEXT: or a6, a4, t3
; RV32I-NEXT: srli a4, a4, 8
; RV32I-NEXT: sb a4, 25(a2)
; RV32I-NEXT: srli a4, t6, 24
; RV32I-NEXT: sb a4, 31(a2)
; RV32I-NEXT: srli a4, t6, 16
; RV32I-NEXT: sb a4, 30(a2)
-; RV32I-NEXT: or a4, t6, a7
-; RV32I-NEXT: srli a7, t6, 8
-; RV32I-NEXT: sb a7, 29(a2)
-; RV32I-NEXT: srli a7, t5, 24
-; RV32I-NEXT: sb a7, 19(a2)
-; RV32I-NEXT: srli a7, t5, 16
-; RV32I-NEXT: sb a7, 18(a2)
-; RV32I-NEXT: or a7, t5, s1
-; RV32I-NEXT: srli t2, t5, 8
-; RV32I-NEXT: sb t2, 17(a2)
-; RV32I-NEXT: srli t2, t3, 24
-; RV32I-NEXT: sb t2, 23(a2)
-; RV32I-NEXT: srli t2, t3, 16
-; RV32I-NEXT: sb t2, 22(a2)
-; RV32I-NEXT: or t2, t3, s2
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, t1, 24
-; RV32I-NEXT: sb t3, 11(a2)
-; RV32I-NEXT: srli t3, t1, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, t1, t4
+; RV32I-NEXT: or a4, t6, t0
+; RV32I-NEXT: srli t0, t6, 8
+; RV32I-NEXT: sb t0, 29(a2)
+; RV32I-NEXT: srli t0, t5, 24
+; RV32I-NEXT: sb t0, 19(a2)
+; RV32I-NEXT: srli t0, t5, 16
+; RV32I-NEXT: sb t0, 18(a2)
+; RV32I-NEXT: or t0, t5, s1
+; RV32I-NEXT: srli t3, t5, 8
+; RV32I-NEXT: sb t3, 17(a2)
+; RV32I-NEXT: srli t3, t4, 24
+; RV32I-NEXT: sb t3, 23(a2)
+; RV32I-NEXT: srli t3, t4, 16
+; RV32I-NEXT: sb t3, 22(a2)
+; RV32I-NEXT: or t3, t4, s2
+; RV32I-NEXT: srli t4, t4, 8
+; RV32I-NEXT: sb t4, 21(a2)
+; RV32I-NEXT: srli t4, t2, 24
+; RV32I-NEXT: sb t4, 11(a2)
+; RV32I-NEXT: srli t4, t2, 16
+; RV32I-NEXT: sb t4, 10(a2)
+; RV32I-NEXT: or a7, t2, a7
+; RV32I-NEXT: srli t2, t2, 8
+; RV32I-NEXT: sb t2, 9(a2)
+; RV32I-NEXT: srli t2, t1, 24
+; RV32I-NEXT: sb t2, 15(a2)
+; RV32I-NEXT: srli t2, t1, 16
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: or a5, t1, a5
; RV32I-NEXT: srli t1, t1, 8
-; RV32I-NEXT: sb t1, 9(a2)
-; RV32I-NEXT: srli t1, t0, 24
-; RV32I-NEXT: sb t1, 15(a2)
-; RV32I-NEXT: srli t1, t0, 16
-; RV32I-NEXT: sb t1, 14(a2)
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: srli t0, t0, 8
-; RV32I-NEXT: sb t0, 13(a2)
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: srli t0, a1, 16
-; RV32I-NEXT: sb t0, 2(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: srli t1, a1, 24
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: sb t1, 2(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: srli a1, a1, 8
; RV32I-NEXT: sb a1, 1(a2)
@@ -2733,9 +2846,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb a3, 5(a2)
; RV32I-NEXT: sb a6, 24(a2)
; RV32I-NEXT: sb a4, 28(a2)
-; RV32I-NEXT: sb a7, 16(a2)
-; RV32I-NEXT: sb t2, 20(a2)
-; RV32I-NEXT: sb t3, 8(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb t3, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
; RV32I-NEXT: sb a5, 12(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
@@ -2762,341 +2875,396 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a5, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a6, 2(a0)
+; RV64I-NEXT: slli a3, a5, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli t0, a7, 8
+; RV64I-NEXT: or t0, t0, a6
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 4(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
; RV64I-NEXT: lbu t3, 6(a0)
-; RV64I-NEXT: lbu t4, 7(a0)
-; RV64I-NEXT: lbu t5, 8(a0)
-; RV64I-NEXT: lbu t6, 9(a0)
-; RV64I-NEXT: lbu s0, 10(a0)
-; RV64I-NEXT: lbu s1, 11(a0)
-; RV64I-NEXT: lbu s2, 12(a0)
-; RV64I-NEXT: lbu s3, 13(a0)
-; RV64I-NEXT: lbu s4, 14(a0)
-; RV64I-NEXT: lbu s5, 15(a0)
-; RV64I-NEXT: lbu s6, 16(a0)
-; RV64I-NEXT: lbu s7, 17(a0)
-; RV64I-NEXT: lbu s8, 18(a0)
-; RV64I-NEXT: lbu s9, 19(a0)
-; RV64I-NEXT: lbu a3, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: or a3, a3, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or a3, s11, a3
-; RV64I-NEXT: lbu s11, 4(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a3, t0, a3
+; RV64I-NEXT: lbu t2, 9(a0)
+; RV64I-NEXT: lbu t1, 8(a0)
+; RV64I-NEXT: lbu t4, 11(a0)
+; RV64I-NEXT: lbu t3, 10(a0)
+; RV64I-NEXT: slli t0, t2, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t5, t4, 8
+; RV64I-NEXT: or t5, t5, t3
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or t0, t5, t0
+; RV64I-NEXT: lbu t5, 13(a0)
+; RV64I-NEXT: lbu t6, 12(a0)
+; RV64I-NEXT: lbu s0, 15(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: or s0, s0, s1
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t5, s0, t5
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: or t0, t5, t0
+; RV64I-NEXT: lbu s0, 17(a0)
+; RV64I-NEXT: lbu t6, 16(a0)
+; RV64I-NEXT: lbu s2, 19(a0)
+; RV64I-NEXT: lbu s1, 18(a0)
+; RV64I-NEXT: slli t5, s0, 8
+; RV64I-NEXT: or t5, t5, t6
+; RV64I-NEXT: slli s3, s2, 8
+; RV64I-NEXT: or s3, s3, s1
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, t5
+; RV64I-NEXT: lbu s3, 21(a0)
+; RV64I-NEXT: lbu s4, 20(a0)
+; RV64I-NEXT: lbu s5, 23(a0)
+; RV64I-NEXT: lbu s6, 22(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: or s3, s3, s4
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s4, s5, s6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s3, s4, s3
+; RV64I-NEXT: slli s3, s3, 32
+; RV64I-NEXT: or t5, s3, t5
+; RV64I-NEXT: lbu s4, 25(a0)
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s6, 27(a0)
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: slli s5, s4, 8
+; RV64I-NEXT: or s5, s5, s3
+; RV64I-NEXT: slli s8, s6, 8
+; RV64I-NEXT: or s8, s8, s7
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or s5, s8, s5
+; RV64I-NEXT: lbu s8, 29(a0)
+; RV64I-NEXT: lbu s9, 28(a0)
+; RV64I-NEXT: lbu s10, 31(a0)
+; RV64I-NEXT: lbu a0, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s11
-; RV64I-NEXT: lbu s11, 20(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 21(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t2, a1, a3
-; RV64I-NEXT: lbu t0, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
-; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a4, 28(a0)
-; RV64I-NEXT: lbu a0, 27(a0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a4, 84(sp)
-; RV64I-NEXT: sb a0, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb t0, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb ra, 77(sp)
-; RV64I-NEXT: sb s11, 76(sp)
-; RV64I-NEXT: sb s9, 75(sp)
-; RV64I-NEXT: sb s8, 74(sp)
-; RV64I-NEXT: sb s7, 73(sp)
-; RV64I-NEXT: sb s6, 72(sp)
-; RV64I-NEXT: sb s5, 71(sp)
-; RV64I-NEXT: sb s4, 70(sp)
-; RV64I-NEXT: sb s3, 69(sp)
-; RV64I-NEXT: sb s2, 68(sp)
-; RV64I-NEXT: sb s1, 67(sp)
-; RV64I-NEXT: sb s0, 66(sp)
-; RV64I-NEXT: sb t6, 65(sp)
-; RV64I-NEXT: sb t5, 64(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb t4, 63(sp)
-; RV64I-NEXT: sb t3, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: slli a0, t2, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a1, a1, a0
-; RV64I-NEXT: lbu a0, 9(a1)
-; RV64I-NEXT: lbu a3, 8(a1)
-; RV64I-NEXT: lbu a4, 10(a1)
-; RV64I-NEXT: lbu a5, 11(a1)
+; RV64I-NEXT: or a0, s10, a0
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or s5, a0, s5
+; RV64I-NEXT: lbu a0, 1(a1)
+; RV64I-NEXT: lbu s8, 0(a1)
+; RV64I-NEXT: lbu s9, 3(a1)
+; RV64I-NEXT: lbu s10, 2(a1)
; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a0, a0, s8
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s8, s9, s10
+; RV64I-NEXT: slli s8, s8, 16
+; RV64I-NEXT: or a0, s8, a0
+; RV64I-NEXT: lbu s8, 5(a1)
+; RV64I-NEXT: lbu s9, 4(a1)
+; RV64I-NEXT: lbu s10, 7(a1)
+; RV64I-NEXT: lbu a1, 6(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s8, s8, s9
+; RV64I-NEXT: slli s10, s10, 8
+; RV64I-NEXT: or a1, s10, a1
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, s8
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: sb s6, 35(sp)
+; RV64I-NEXT: sb s7, 34(sp)
+; RV64I-NEXT: sb s4, 33(sp)
+; RV64I-NEXT: sb s3, 32(sp)
+; RV64I-NEXT: sb s2, 27(sp)
+; RV64I-NEXT: sb s1, 26(sp)
+; RV64I-NEXT: sb s0, 25(sp)
+; RV64I-NEXT: sb t6, 24(sp)
+; RV64I-NEXT: sb t4, 19(sp)
+; RV64I-NEXT: sb t3, 18(sp)
+; RV64I-NEXT: sb t2, 17(sp)
+; RV64I-NEXT: sb t1, 16(sp)
+; RV64I-NEXT: sb a7, 11(sp)
+; RV64I-NEXT: sb a6, 10(sp)
+; RV64I-NEXT: sb a5, 9(sp)
+; RV64I-NEXT: sb a4, 8(sp)
+; RV64I-NEXT: srai a1, s5, 63
+; RV64I-NEXT: sb a1, 64(sp)
+; RV64I-NEXT: sb a1, 56(sp)
+; RV64I-NEXT: sb a1, 48(sp)
+; RV64I-NEXT: sb a1, 40(sp)
+; RV64I-NEXT: srli a4, s5, 56
+; RV64I-NEXT: sb a4, 39(sp)
+; RV64I-NEXT: srli a4, s5, 48
+; RV64I-NEXT: sb a4, 38(sp)
+; RV64I-NEXT: srli a4, s5, 40
+; RV64I-NEXT: sb a4, 37(sp)
+; RV64I-NEXT: srli a4, s5, 32
+; RV64I-NEXT: sb a4, 36(sp)
+; RV64I-NEXT: srli a4, t5, 56
+; RV64I-NEXT: sb a4, 31(sp)
+; RV64I-NEXT: srli a4, t5, 48
+; RV64I-NEXT: sb a4, 30(sp)
+; RV64I-NEXT: srli a4, t5, 40
+; RV64I-NEXT: sb a4, 29(sp)
+; RV64I-NEXT: srli a4, t5, 32
+; RV64I-NEXT: sb a4, 28(sp)
+; RV64I-NEXT: srli a4, t0, 56
+; RV64I-NEXT: sb a4, 23(sp)
+; RV64I-NEXT: srli a4, t0, 48
+; RV64I-NEXT: sb a4, 22(sp)
+; RV64I-NEXT: srli a4, t0, 40
+; RV64I-NEXT: sb a4, 21(sp)
+; RV64I-NEXT: srli a4, t0, 32
+; RV64I-NEXT: sb a4, 20(sp)
+; RV64I-NEXT: srli a4, a3, 56
+; RV64I-NEXT: sb a4, 15(sp)
+; RV64I-NEXT: srli a4, a3, 48
+; RV64I-NEXT: sb a4, 14(sp)
+; RV64I-NEXT: srli a4, a3, 40
+; RV64I-NEXT: sb a4, 13(sp)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 12(sp)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 71(sp)
+; RV64I-NEXT: srli a4, a1, 48
+; RV64I-NEXT: sb a4, 70(sp)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 69(sp)
+; RV64I-NEXT: srli a6, a1, 32
+; RV64I-NEXT: sb a6, 68(sp)
+; RV64I-NEXT: srli a7, a1, 24
+; RV64I-NEXT: sb a7, 67(sp)
+; RV64I-NEXT: srli t0, a1, 16
+; RV64I-NEXT: sb t0, 66(sp)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 65(sp)
+; RV64I-NEXT: sb a3, 63(sp)
+; RV64I-NEXT: sb a4, 62(sp)
+; RV64I-NEXT: sb a5, 61(sp)
+; RV64I-NEXT: sb a6, 60(sp)
+; RV64I-NEXT: sb a7, 59(sp)
+; RV64I-NEXT: sb t0, 58(sp)
+; RV64I-NEXT: sb a1, 57(sp)
+; RV64I-NEXT: sb a3, 55(sp)
+; RV64I-NEXT: sb a4, 54(sp)
+; RV64I-NEXT: sb a5, 53(sp)
+; RV64I-NEXT: sb a6, 52(sp)
+; RV64I-NEXT: sb a7, 51(sp)
+; RV64I-NEXT: sb t0, 50(sp)
+; RV64I-NEXT: sb a1, 49(sp)
+; RV64I-NEXT: sb a3, 47(sp)
+; RV64I-NEXT: sb a4, 46(sp)
+; RV64I-NEXT: sb a5, 45(sp)
+; RV64I-NEXT: sb a6, 44(sp)
+; RV64I-NEXT: sb a7, 43(sp)
+; RV64I-NEXT: sb t0, 42(sp)
+; RV64I-NEXT: sb a1, 41(sp)
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 59
+; RV64I-NEXT: addi a3, sp, 8
+; RV64I-NEXT: add a3, a3, a1
+; RV64I-NEXT: lbu a1, 9(a3)
+; RV64I-NEXT: lbu a4, 8(a3)
+; RV64I-NEXT: lbu a5, 11(a3)
+; RV64I-NEXT: lbu a6, 10(a3)
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a4, a5, a6
; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a3, 13(a1)
-; RV64I-NEXT: lbu a4, 12(a1)
-; RV64I-NEXT: lbu a5, 14(a1)
-; RV64I-NEXT: lbu a6, 15(a1)
-; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: or a1, a4, a1
+; RV64I-NEXT: lbu a4, 13(a3)
+; RV64I-NEXT: lbu a5, 12(a3)
+; RV64I-NEXT: lbu a6, 15(a3)
+; RV64I-NEXT: lbu a7, 14(a3)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a5, a6, a7
; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a4, a3, a0
-; RV64I-NEXT: andi a3, t2, 7
-; RV64I-NEXT: lbu a0, 17(a1)
-; RV64I-NEXT: lbu a5, 16(a1)
-; RV64I-NEXT: lbu a6, 18(a1)
-; RV64I-NEXT: lbu a7, 19(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: andi a4, a0, 7
+; RV64I-NEXT: srl a0, a5, a4
+; RV64I-NEXT: lbu a1, 17(a3)
+; RV64I-NEXT: lbu a6, 16(a3)
+; RV64I-NEXT: lbu a7, 19(a3)
+; RV64I-NEXT: lbu t0, 18(a3)
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a6, a7, t0
; RV64I-NEXT: slli a6, a6, 16
-; RV64I-NEXT: slli a7, a7, 24
-; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a1)
-; RV64I-NEXT: lbu a6, 20(a1)
-; RV64I-NEXT: lbu a7, 22(a1)
-; RV64I-NEXT: lbu t0, 23(a1)
-; RV64I-NEXT: slli a5, a5, 8
-; RV64I-NEXT: or a5, a5, a6
-; RV64I-NEXT: slli a7, a7, 16
-; RV64I-NEXT: slli t0, t0, 24
-; RV64I-NEXT: or a6, t0, a7
-; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a3
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a1)
-; RV64I-NEXT: lbu a7, 0(a1)
-; RV64I-NEXT: lbu t0, 2(a1)
-; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: lbu a6, 21(a3)
+; RV64I-NEXT: lbu a7, 20(a3)
+; RV64I-NEXT: lbu t0, 23(a3)
+; RV64I-NEXT: lbu t1, 22(a3)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
-; RV64I-NEXT: slli t0, t0, 16
-; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: or a7, t1, t0
-; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a1)
-; RV64I-NEXT: lbu t0, 4(a1)
-; RV64I-NEXT: lbu t1, 6(a1)
-; RV64I-NEXT: lbu t2, 7(a1)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a7, t0, t1
+; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a1)
-; RV64I-NEXT: lbu t0, 24(a1)
-; RV64I-NEXT: lbu t1, 26(a1)
-; RV64I-NEXT: lbu t2, 27(a1)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a1)
-; RV64I-NEXT: lbu t1, 28(a1)
-; RV64I-NEXT: lbu t2, 30(a1)
-; RV64I-NEXT: lbu a1, 31(a1)
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a7, a6, a1
+; RV64I-NEXT: slli a1, a7, 1
+; RV64I-NEXT: not a6, a4
+; RV64I-NEXT: sll a1, a1, a6
+; RV64I-NEXT: or a1, a0, a1
+; RV64I-NEXT: lbu a6, 1(a3)
+; RV64I-NEXT: lbu t0, 0(a3)
+; RV64I-NEXT: lbu t1, 3(a3)
+; RV64I-NEXT: lbu t2, 2(a3)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, t0
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t0, t1, t2
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: lbu t0, 5(a3)
+; RV64I-NEXT: lbu t1, 4(a3)
+; RV64I-NEXT: lbu t2, 7(a3)
+; RV64I-NEXT: lbu t3, 6(a3)
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t1, t2, t3
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: slli t0, t0, 32
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: srl a6, a6, a4
+; RV64I-NEXT: slli a5, a5, 1
+; RV64I-NEXT: xori t0, a4, 63
+; RV64I-NEXT: sll a5, a5, t0
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: srl a7, a7, a4
+; RV64I-NEXT: lbu t1, 25(a3)
+; RV64I-NEXT: lbu t2, 24(a3)
+; RV64I-NEXT: lbu t3, 27(a3)
+; RV64I-NEXT: lbu t4, 26(a3)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: or t1, t1, t2
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t4
; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, t2
-; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: or a1, a1, t0
-; RV64I-NEXT: xori t0, a3, 63
-; RV64I-NEXT: sll t1, t1, t0
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a7, a1, a7
-; RV64I-NEXT: slli a1, a7, 1
-; RV64I-NEXT: sll t0, a1, t0
-; RV64I-NEXT: srl a1, a4, a3
-; RV64I-NEXT: srl a4, a6, a3
-; RV64I-NEXT: srl a5, a5, a3
-; RV64I-NEXT: sra a3, a7, a3
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
-; RV64I-NEXT: sb a5, 16(a2)
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a3, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a3, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a3, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a3, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a3, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a3, 16
-; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: lbu t2, 29(a3)
+; RV64I-NEXT: lbu t3, 28(a3)
+; RV64I-NEXT: lbu t4, 31(a3)
+; RV64I-NEXT: lbu a3, 30(a3)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli a3, a3, 16
+; RV64I-NEXT: or a3, a3, t2
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: or a3, a3, t1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: or t0, a7, t0
+; RV64I-NEXT: sra a3, a3, a4
+; RV64I-NEXT: sb a7, 16(a2)
; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a6, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a4, a7, 48
+; RV64I-NEXT: sb a4, 22(a2)
+; RV64I-NEXT: srli a4, a7, 40
+; RV64I-NEXT: sb a4, 21(a2)
+; RV64I-NEXT: srli a4, a7, 32
+; RV64I-NEXT: sb a4, 20(a2)
+; RV64I-NEXT: srli a4, a7, 24
+; RV64I-NEXT: sb a4, 19(a2)
+; RV64I-NEXT: srli a4, a7, 16
+; RV64I-NEXT: sb a4, 18(a2)
+; RV64I-NEXT: srli a4, a7, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a4, a3, 56
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: srli a4, a3, 48
+; RV64I-NEXT: sb a4, 30(a2)
+; RV64I-NEXT: srli a4, a3, 40
+; RV64I-NEXT: sb a4, 29(a2)
+; RV64I-NEXT: srli a4, a3, 32
+; RV64I-NEXT: sb a4, 28(a2)
+; RV64I-NEXT: srli a4, a3, 24
+; RV64I-NEXT: sb a4, 27(a2)
+; RV64I-NEXT: srli a4, a3, 16
+; RV64I-NEXT: sb a4, 26(a2)
; RV64I-NEXT: srli a3, a3, 8
; RV64I-NEXT: sb a3, 25(a2)
-; RV64I-NEXT: srli a3, a4, 48
+; RV64I-NEXT: srli a3, a6, 48
; RV64I-NEXT: sb a3, 6(a2)
-; RV64I-NEXT: srli a3, a4, 40
+; RV64I-NEXT: srli a3, a6, 40
; RV64I-NEXT: sb a3, 5(a2)
-; RV64I-NEXT: srli a3, a4, 32
+; RV64I-NEXT: srli a3, a6, 32
; RV64I-NEXT: sb a3, 4(a2)
-; RV64I-NEXT: srli a3, a4, 24
+; RV64I-NEXT: srli a3, a6, 24
; RV64I-NEXT: sb a3, 3(a2)
-; RV64I-NEXT: srli a3, a4, 16
+; RV64I-NEXT: srli a3, a6, 16
; RV64I-NEXT: sb a3, 2(a2)
-; RV64I-NEXT: or a3, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
-; RV64I-NEXT: srli a4, a4, 8
-; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a1, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a1, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a1, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a1, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a1, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a1, a0
-; RV64I-NEXT: sb a1, 8(a2)
-; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: srli a1, a6, 56
-; RV64I-NEXT: sb a1, 23(a2)
-; RV64I-NEXT: srli a3, a3, 56
-; RV64I-NEXT: sb a3, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a3, a6, 8
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: srli a3, a0, 48
+; RV64I-NEXT: sb a3, 14(a2)
+; RV64I-NEXT: srli a3, a0, 40
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: srli a3, a0, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a0, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: srli a0, t0, 56
+; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: srli a5, a5, 56
+; RV64I-NEXT: sb a5, 7(a2)
+; RV64I-NEXT: srli a1, a1, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
@@ -3146,14 +3314,14 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s10, 20(a0)
; RV32I-NEXT: lbu s11, 0(a1)
; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: lbu ra, 3(a1)
+; RV32I-NEXT: lbu a1, 2(a1)
; RV32I-NEXT: or a3, a3, s11
; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: slli ra, ra, 8
+; RV32I-NEXT: or a1, ra, a1
; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or t1, a1, a3
; RV32I-NEXT: lbu t0, 23(a0)
; RV32I-NEXT: lbu a7, 24(a0)
@@ -3180,7 +3348,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 44(sp)
; RV32I-NEXT: sb s5, 43(sp)
; RV32I-NEXT: sb t3, 59(sp)
-; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: sb s4, 42(sp)
; RV32I-NEXT: sb s3, 41(sp)
; RV32I-NEXT: sb s2, 40(sp)
@@ -3244,82 +3413,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: add a4, a4, a0
; RV32I-NEXT: lbu a0, 5(a4)
; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
+; RV32I-NEXT: lbu a3, 7(a4)
+; RV32I-NEXT: lbu a5, 6(a4)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: or t5, a3, a0
; RV32I-NEXT: andi a3, t1, 7
; RV32I-NEXT: lbu a0, 9(a4)
; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
+; RV32I-NEXT: lbu a5, 11(a4)
+; RV32I-NEXT: lbu a6, 10(a4)
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a1, a5, a6
+; RV32I-NEXT: slli a1, a1, 16
; RV32I-NEXT: or a6, a1, a0
; RV32I-NEXT: slli a0, a6, 1
; RV32I-NEXT: not t1, a3
; RV32I-NEXT: sll a0, a0, t1
; RV32I-NEXT: lbu a1, 1(a4)
; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
+; RV32I-NEXT: lbu a7, 3(a4)
+; RV32I-NEXT: lbu t0, 2(a4)
; RV32I-NEXT: slli a1, a1, 8
; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a5, a7, t0
+; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: or t0, a5, a1
; RV32I-NEXT: slli a1, t5, 1
; RV32I-NEXT: xori t2, a3, 31
; RV32I-NEXT: sll a1, a1, t2
; RV32I-NEXT: lbu a5, 13(a4)
; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
+; RV32I-NEXT: lbu t3, 15(a4)
+; RV32I-NEXT: lbu t4, 14(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a7, t3, t4
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: or t3, a7, a5
; RV32I-NEXT: lbu a5, 17(a4)
; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
+; RV32I-NEXT: lbu t4, 19(a4)
+; RV32I-NEXT: lbu t6, 18(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: or a7, t4, t6
+; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: or t4, a7, a5
; RV32I-NEXT: slli a5, t4, 1
; RV32I-NEXT: sll a7, a5, t1
; RV32I-NEXT: lbu a5, 21(a4)
; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
+; RV32I-NEXT: lbu s0, 23(a4)
+; RV32I-NEXT: lbu s1, 22(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, t6
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: or s0, s0, s1
; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
; RV32I-NEXT: or s0, s0, a5
; RV32I-NEXT: lbu a5, 25(a4)
; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
+; RV32I-NEXT: lbu s1, 27(a4)
+; RV32I-NEXT: lbu s2, 26(a4)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or t6, s1, s2
+; RV32I-NEXT: slli t6, t6, 16
; RV32I-NEXT: or t6, t6, a5
; RV32I-NEXT: lbu a5, 29(a4)
; RV32I-NEXT: lbu s1, 28(a4)
@@ -3327,15 +3496,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sll t1, s2, t1
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
+; RV32I-NEXT: lbu s1, 31(a4)
+; RV32I-NEXT: lbu a4, 30(a4)
; RV32I-NEXT: slli s2, t3, 1
; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a4, s1, a4
; RV32I-NEXT: slli s1, s0, 1
; RV32I-NEXT: sll s1, s1, t2
+; RV32I-NEXT: slli a4, a4, 16
; RV32I-NEXT: or s3, a4, a5
; RV32I-NEXT: slli a4, s3, 1
; RV32I-NEXT: sll t2, a4, t2
diff --git a/llvm/test/CodeGen/SystemZ/int-abs-01.ll b/llvm/test/CodeGen/SystemZ/int-abs-01.ll
index 7bdf622ed67d1a..1dbc9e61735661 100644
--- a/llvm/test/CodeGen/SystemZ/int-abs-01.ll
+++ b/llvm/test/CodeGen/SystemZ/int-abs-01.ll
@@ -94,7 +94,11 @@ define i64 @f7(i64 %val) {
define i64 @f8(i64 %val) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
-; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgibhe %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB7_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -108,7 +112,11 @@ define i64 @f8(i64 %val) {
define i64 @f9(i64 %val) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
-; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgibh %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB8_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -122,7 +130,11 @@ define i64 @f9(i64 %val) {
define i64 @f10(i64 %val) {
; CHECK-LABEL: f10:
; CHECK: # %bb.0:
-; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgibh %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB9_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -136,7 +148,11 @@ define i64 @f10(i64 %val) {
define i64 @f11(i64 %val) {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
-; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgibhe %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB10_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-44.ll b/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
index 559dcfe25d237f..31d72e9c4fc2d5 100644
--- a/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -959,11 +959,12 @@ exit:
define i64 @f40(i64 %dummy, i64 %a, ptr %dest) {
; CHECK-LABEL: f40:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: sllg %r0, %r3, 32
+; CHECK-NEXT: srag %r2, %r0, 32
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: cgibh %r0, 0, 0(%r14)
; CHECK-NEXT: .LBB39_1: # %store
; CHECK-NEXT: stg %r2, 0(%r4)
; CHECK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/int-neg-02.ll b/llvm/test/CodeGen/SystemZ/int-neg-02.ll
index 7d62fe743a8b62..ae6aa9e99c96aa 100644
--- a/llvm/test/CodeGen/SystemZ/int-neg-02.ll
+++ b/llvm/test/CodeGen/SystemZ/int-neg-02.ll
@@ -102,7 +102,13 @@ define i64 @f7(i64 %val) {
define i64 @f8(i64 %val) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r1, %r2, 32
+; CHECK-NEXT: srag %r0, %r1, 32
+; CHECK-NEXT: cgijhe %r1, 0, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: lcgr %r0, %r0
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: lcgr %r2, %r0
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -117,7 +123,13 @@ define i64 @f8(i64 %val) {
define i64 @f9(i64 %val) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r1, %r2, 32
+; CHECK-NEXT: srag %r0, %r1, 32
+; CHECK-NEXT: cgijh %r1, 0, .LBB8_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: lcgr %r0, %r0
+; CHECK-NEXT: .LBB8_2:
+; CHECK-NEXT: lcgr %r2, %r0
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -132,7 +144,13 @@ define i64 @f9(i64 %val) {
define i64 @f10(i64 %val) {
; CHECK-LABEL: f10:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r1, %r2, 32
+; CHECK-NEXT: srag %r0, %r1, 32
+; CHECK-NEXT: cgijh %r1, 0, .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: lcgr %r0, %r0
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: lcgr %r2, %r0
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -147,7 +165,13 @@ define i64 @f10(i64 %val) {
define i64 @f11(i64 %val) {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r1, %r2, 32
+; CHECK-NEXT: srag %r0, %r1, 32
+; CHECK-NEXT: cgijhe %r1, 0, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: lcgr %r0, %r0
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: lcgr %r2, %r0
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -162,7 +186,11 @@ define i64 @f11(i64 %val) {
define i64 @f12(i64 %val) {
; CHECK-LABEL: f12:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgibl %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB11_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -176,7 +204,11 @@ define i64 @f12(i64 %val) {
define i64 @f13(i64 %val) {
; CHECK-LABEL: f13:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgible %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB12_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -190,7 +222,11 @@ define i64 @f13(i64 %val) {
define i64 @f14(i64 %val) {
; CHECK-LABEL: f14:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgible %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB13_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
@@ -204,7 +240,11 @@ define i64 @f14(i64 %val) {
define i64 @f15(i64 %val) {
; CHECK-LABEL: f15:
; CHECK: # %bb.0:
-; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: sllg %r0, %r2, 32
+; CHECK-NEXT: srag %r2, %r0, 32
+; CHECK-NEXT: cgibl %r0, 0, 0(%r14)
+; CHECK-NEXT: .LBB14_1:
+; CHECK-NEXT: lcgr %r2, %r2
; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index 249136af1c7374..9a8c5b7af964ed 100644
--- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -126,7 +126,7 @@ define void @fun2(<8 x i32> %src, ptr %p)
; CHECK-NEXT: vsl %v4, %v4, %v5
; CHECK-NEXT: vo %v3, %v3, %v4
; CHECK-NEXT: vlvgp %v4, %r0, %r0
-; CHECK-NEXT: vlgvf %r0, %v24, 0
+; CHECK-NEXT: vlgvf %r0, %v24, 1
; CHECK-NEXT: vn %v4, %v4, %v2
; CHECK-NEXT: vrepib %v5, 62
; CHECK-NEXT: vslb %v4, %v4, %v5
@@ -134,14 +134,15 @@ define void @fun2(<8 x i32> %src, ptr %p)
; CHECK-NEXT: vo %v4, %v3, %v4
; CHECK-NEXT: vo %v1, %v4, %v1
; CHECK-NEXT: vrepib %v4, 56
-; CHECK-NEXT: vrepib %v5, 58
+; CHECK-NEXT: vrepib %v5, 89
; CHECK-NEXT: vsrlb %v1, %v1, %v4
; CHECK-NEXT: vsteg %v1, 16(%r2), 1
; CHECK-NEXT: vrepib %v1, 120
-; CHECK-NEXT: vrepib %v4, 89
+; CHECK-NEXT: vrepib %v4, 58
; CHECK-NEXT: vsrlb %v1, %v3, %v1
; CHECK-NEXT: vlvgp %v3, %r0, %r0
-; CHECK-NEXT: vlgvf %r0, %v24, 1
+; CHECK-NEXT: vlgvf %r0, %v24, 0
+; CHECK-NEXT: vn %v3, %v3, %v2
; CHECK-NEXT: vslb %v3, %v3, %v4
; CHECK-NEXT: vsl %v3, %v3, %v4
; CHECK-NEXT: vlvgp %v4, %r0, %r0
@@ -149,7 +150,7 @@ define void @fun2(<8 x i32> %src, ptr %p)
; CHECK-NEXT: vn %v4, %v4, %v2
; CHECK-NEXT: vslb %v4, %v4, %v5
; CHECK-NEXT: vsl %v4, %v4, %v5
-; CHECK-NEXT: vo %v3, %v3, %v4
+; CHECK-NEXT: vo %v3, %v4, %v3
; CHECK-NEXT: vlvgp %v4, %r0, %r0
; CHECK-NEXT: vn %v2, %v4, %v2
; CHECK-NEXT: vrepib %v4, 27
@@ -183,9 +184,25 @@ define void @fun3(ptr %src, ptr %p)
; CHECK-NEXT: vrepib %v2, 32
; CHECK-NEXT: vslb %v0, %v0, %v2
; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vrepib %v4, 31
+; CHECK-NEXT: larl %r1, .LCPI3_0
+; CHECK-NEXT: vsrlb %v5, %v0, %v4
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vsrl %v5, %v5, %v4
+; CHECK-NEXT: vn %v3, %v0, %v1
+; CHECK-NEXT: vn %v1, %v5, %v1
+; CHECK-NEXT: vrepib %v5, 62
+; CHECK-NEXT: vsrlb %v0, %v0, %v5
+; CHECK-NEXT: vsrl %v0, %v0, %v5
+; CHECK-NEXT: vslb %v0, %v0, %v5
+; CHECK-NEXT: vsl %v0, %v0, %v5
+; CHECK-NEXT: vslb %v1, %v1, %v4
+; CHECK-NEXT: vsl %v1, %v1, %v4
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vsrlb %v1, %v0, %v2
+; CHECK-NEXT: vsteg %v1, 0(%r3), 1
+; CHECK-NEXT: vo %v0, %v0, %v3
; CHECK-NEXT: vstef %v0, 8(%r3), 3
-; CHECK-NEXT: vsrlb %v0, %v0, %v2
-; CHECK-NEXT: vsteg %v0, 0(%r3), 1
; CHECK-NEXT: br %r14
{
%tmp = load <3 x i31>, ptr %src
diff --git a/llvm/test/CodeGen/Thumb/shift-and.ll b/llvm/test/CodeGen/Thumb/shift-and.ll
index e5fee86343b0ed..ca383a733f2ea0 100644
--- a/llvm/test/CodeGen/Thumb/shift-and.ll
+++ b/llvm/test/CodeGen/Thumb/shift-and.ll
@@ -192,8 +192,9 @@ entry:
define i32 @test16(i32 %x) {
; CHECK-LABEL: test16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: lsls r0, r0, #28
-; CHECK-NEXT: lsrs r0, r0, #26
+; CHECK-NEXT: movs r1, #15
+; CHECK-NEXT: ands r1, r0
+; CHECK-NEXT: lsls r0, r1, #2
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %x, 15
@@ -204,8 +205,9 @@ entry:
define ptr @test17(ptr %p, i32 %x) {
; CHECK-LABEL: test17:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: lsrs r1, r1, #26
+; CHECK-NEXT: movs r2, #15
+; CHECK-NEXT: ands r2, r1
+; CHECK-NEXT: lsls r1, r2, #2
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: bx lr
entry:
@@ -218,8 +220,9 @@ define ptr @test18(ptr %p, i32 %x) {
; CHECK-LABEL: test18:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: adds r1, r1, #1
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: lsrs r1, r1, #26
+; CHECK-NEXT: movs r2, #15
+; CHECK-NEXT: ands r2, r1
+; CHECK-NEXT: lsls r1, r2, #2
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: bx lr
entry:
@@ -233,8 +236,9 @@ define ptr @test19(ptr %p, i32 %x) {
; CHECK-LABEL: test19:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: subs r1, r1, #1
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: lsrs r1, r1, #26
+; CHECK-NEXT: movs r2, #15
+; CHECK-NEXT: ands r2, r1
+; CHECK-NEXT: lsls r1, r2, #2
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: bx lr
entry:
@@ -248,8 +252,9 @@ define ptr @test20(ptr %p, i32 %x) {
; CHECK-LABEL: test20:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: subs r1, r1, #1
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: lsrs r1, r1, #26
+; CHECK-NEXT: movs r2, #15
+; CHECK-NEXT: ands r2, r1
+; CHECK-NEXT: lsls r1, r2, #2
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
index 00eed6483cc120..a76d0cf26f0d01 100644
--- a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
@@ -37,9 +37,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK-NEXT: asrs r1, r1, #28
; CHECK-NEXT: movs r2, #3
; CHECK-NEXT: muls r2, r1, r2
-; CHECK-NEXT: lsrs r1, r2, #31
-; CHECK-NEXT: lsrs r2, r2, #4
-; CHECK-NEXT: adds r1, r2, r1
+; CHECK-NEXT: lsrs r1, r2, #4
+; CHECK-NEXT: movs r2, #8
+; CHECK-NEXT: ands r2, r1
+; CHECK-NEXT: lsrs r2, r2, #3
+; CHECK-NEXT: adds r1, r1, r2
; CHECK-NEXT: movs r2, #6
; CHECK-NEXT: muls r2, r1, r2
; CHECK-NEXT: subs r0, r0, r2
@@ -59,11 +61,14 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: lsls r1, r0, #26
; CHECK-NEXT: asrs r1, r1, #26
-; CHECK-NEXT: lsrs r1, r1, #30
-; CHECK-NEXT: adds r1, r0, r1
-; CHECK-NEXT: movs r2, #60
+; CHECK-NEXT: lsrs r1, r1, #5
+; CHECK-NEXT: movs r2, #48
; CHECK-NEXT: ands r2, r1
-; CHECK-NEXT: subs r1, r0, r2
+; CHECK-NEXT: lsrs r1, r2, #4
+; CHECK-NEXT: adds r1, r0, r1
+; CHECK-NEXT: lsrs r1, r1, #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: subs r1, r0, r1
; CHECK-NEXT: movs r0, #63
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: subs r1, r0, #1
diff --git a/llvm/test/CodeGen/Thumb/umul_fix_sat.ll b/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
index fa88024315211b..08c88eeec2d3c3 100644
--- a/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
@@ -138,8 +138,10 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, lr}
; ARM-NEXT: push {r4, lr}
-; ARM-NEXT: movs r2, #15
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: movs r1, #15
; ARM-NEXT: ands r2, r1
+; ARM-NEXT: ands r0, r1
; ARM-NEXT: lsls r0, r0, #28
; ARM-NEXT: movs r4, #0
; ARM-NEXT: mov r1, r4
@@ -252,8 +254,10 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, lr}
; ARM-NEXT: push {r4, lr}
-; ARM-NEXT: movs r2, #15
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: movs r1, #15
; ARM-NEXT: ands r2, r1
+; ARM-NEXT: ands r0, r1
; ARM-NEXT: lsls r0, r0, #28
; ARM-NEXT: movs r4, #0
; ARM-NEXT: mov r1, r4
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
index a0e690212d5a43..9f724580220d42 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -41,9 +41,10 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt
; DISABLED-NEXT: blt .LBB0_4
; DISABLED-NEXT: @ %bb.1: @ %vector.ph.preheader
; DISABLED-NEXT: adds r7, r3, #3
+; DISABLED-NEXT: mvn r6, #3
+; DISABLED-NEXT: lsrs r7, r7, #2
+; DISABLED-NEXT: add.w r7, r6, r7, lsl #2
; DISABLED-NEXT: movs r6, #1
-; DISABLED-NEXT: bic r7, r7, #3
-; DISABLED-NEXT: subs r7, #4
; DISABLED-NEXT: add.w r8, r6, r7, lsr #2
; DISABLED-NEXT: .LBB0_2: @ %vector.ph
; DISABLED-NEXT: @ =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/bfx.ll b/llvm/test/CodeGen/Thumb2/bfx.ll
index 0191b81805fd1d..55e470c53c9fe8 100644
--- a/llvm/test/CodeGen/Thumb2/bfx.ll
+++ b/llvm/test/CodeGen/Thumb2/bfx.ll
@@ -4,7 +4,8 @@
define i32 @sbfx1(i32 %a) {
; CHECK-LABEL: sbfx1:
; CHECK: @ %bb.0:
-; CHECK-NEXT: sbfx r0, r0, #7, #11
+; CHECK-NEXT: lsls r0, r0, #14
+; CHECK-NEXT: sbfx r0, r0, #21, #11
; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 1c95d28b5eed1b..e8ca44f2bbb25b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1012,9 +1012,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: it gt
; CHECK-NEXT: asrgt r1, r7, #3
-; CHECK-NEXT: add.w r7, r5, r4, lsl #1
+; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT: subs r1, r7, #2
+; CHECK-NEXT: add.w r1, r5, r7, lsl #1
; CHECK-NEXT: rsbs r7, r4, #0
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r7, r3, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 808626d9a0aebe..5633a8d66b1c06 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1007,12 +1007,12 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: cmp r7, #1
; CHECK-NEXT: it gt
; CHECK-NEXT: asrgt r5, r3, #3
-; CHECK-NEXT: add.w r3, r4, r6, lsl #2
-; CHECK-NEXT: sub.w r9, r3, #4
+; CHECK-NEXT: subs r3, r6, #1
+; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r9, r4, r3, lsl #2
; CHECK-NEXT: rsbs r3, r6, #0
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: add.w r3, r10, #32
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 93cab25c2cb72e..e0004100927d46 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -294,33 +294,35 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB8_1: @ %vector.ph.preheader
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: bic r12, r2, #3
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: lsr.w r12, r2, #2
+; CHECK-NEXT: mvn r3, #3
+; CHECK-NEXT: add.w r12, r3, r12, lsl #2
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: sub.w lr, r12, #4
-; CHECK-NEXT: add.w r4, r3, lr, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB8_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB8_3 Depth 2
-; CHECK-NEXT: dls lr, r4
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: dls lr, r12
+; CHECK-NEXT: bic r0, r2, #3
+; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB8_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #16]!
-; CHECK-NEXT: vstrb.8 q2, [r0], #16
+; CHECK-NEXT: vstrb.8 q2, [r3], #16
; CHECK-NEXT: le lr, .LBB8_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1
-; CHECK-NEXT: cmp r12, r2
+; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: bne .LBB8_2
; CHECK-NEXT: @ %bb.5:
-; CHECK-NEXT: pop.w {r4, lr}
+; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
@@ -364,28 +366,30 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB9_1: @ %vector.ph.preheader
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: bic r12, r2, #3
+; CHECK-NEXT: lsr.w r12, r2, #2
+; CHECK-NEXT: mvn r3, #3
+; CHECK-NEXT: add.w r12, r3, r12, lsl #2
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: sub.w lr, r12, #4
-; CHECK-NEXT: adr r4, .LCPI9_1
-; CHECK-NEXT: adr r5, .LCPI9_2
-; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: add.w r3, r3, lr, lsr #2
; CHECK-NEXT: adr.w lr, .LCPI9_0
-; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: adr r4, .LCPI9_2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
+; CHECK-NEXT: adr r3, .LCPI9_1
+; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: vldrw.u32 q2, [lr]
-; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_3 Depth 2
-; CHECK-NEXT: dls lr, r3
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: dls lr, r12
+; CHECK-NEXT: bic r0, r2, #3
+; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov q5, q2
@@ -397,15 +401,15 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
; CHECK-NEXT: vadd.i32 q6, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [q4, #48]!
; CHECK-NEXT: vadd.i32 q6, q6, q7
-; CHECK-NEXT: vstrb.8 q6, [r0], #16
+; CHECK-NEXT: vstrb.8 q6, [r3], #16
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1
-; CHECK-NEXT: cmp r12, r2
+; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5:
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop.w {r4, r5, r7, lr}
+; CHECK-NEXT: pop.w {r4, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
@@ -468,33 +472,35 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readon
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB10_1: @ %vector.ph.preheader
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: bic r12, r2, #3
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: lsr.w r12, r2, #2
+; CHECK-NEXT: mvn r3, #3
+; CHECK-NEXT: add.w r12, r3, r12, lsl #2
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: sub.w lr, r12, #4
-; CHECK-NEXT: add.w r4, r3, lr, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: adr r3, .LCPI10_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB10_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_3 Depth 2
-; CHECK-NEXT: dls lr, r4
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: dls lr, r12
+; CHECK-NEXT: bic r0, r2, #3
+; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB10_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #508]!
-; CHECK-NEXT: vstrb.8 q2, [r0], #16
+; CHECK-NEXT: vstrb.8 q2, [r3], #16
; CHECK-NEXT: le lr, .LBB10_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1
-; CHECK-NEXT: cmp r12, r2
+; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.5:
-; CHECK-NEXT: pop.w {r4, lr}
+; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
@@ -542,56 +548,60 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
; CHECK-NEXT: .pad #28
; CHECK-NEXT: sub sp, #28
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: blt .LBB11_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: lsrs r3, r2, #3
+; CHECK-NEXT: mvn r6, #7
+; CHECK-NEXT: add.w r3, r6, r3, lsl #3
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: add r2, sp, #12
-; CHECK-NEXT: mov.w r9, #8
-; CHECK-NEXT: bic r1, r1, #7
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT: sub.w r3, r1, #8
-; CHECK-NEXT: add.w r8, r6, r3, lsr #3
+; CHECK-NEXT: add r4, sp, #12
+; CHECK-NEXT: mov.w r12, #8
+; CHECK-NEXT: add.w r1, r6, r3, lsr #3
; CHECK-NEXT: adr r3, .LCPI11_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_3 Depth 2
-; CHECK-NEXT: dls lr, r8
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: mov r9, r2
; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: dls lr, r1
+; CHECK-NEXT: bic r1, r2, #7
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: .LBB11_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vstrw.32 q1, [r2]
-; CHECK-NEXT: mov r12, r2
-; CHECK-NEXT: vldrh.s32 q2, [r2, #8]
-; CHECK-NEXT: vadd.i16 q1, q1, r9
+; CHECK-NEXT: vstrw.32 q1, [r4]
+; CHECK-NEXT: mov r11, r4
+; CHECK-NEXT: vldrh.s32 q2, [r4, #8]
+; CHECK-NEXT: vadd.i16 q1, q1, r12
; CHECK-NEXT: vshl.i32 q2, q2, #1
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vmov r7, r5, d5
-; CHECK-NEXT: vmov r3, r4, d4
-; CHECK-NEXT: vldrh.s32 q2, [r2]
+; CHECK-NEXT: vmov r8, r3, d4
+; CHECK-NEXT: vldrh.s32 q2, [r4]
; CHECK-NEXT: vshl.i32 q2, q2, #1
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vmov r1, r10, d5
; CHECK-NEXT: ldrh r7, [r7]
-; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: ldrh r5, [r5]
; CHECK-NEXT: ldrh.w r2, [r10]
-; CHECK-NEXT: ldrh.w r10, [r3]
-; CHECK-NEXT: vmov r3, r11, d4
+; CHECK-NEXT: ldrh.w r10, [r8]
+; CHECK-NEXT: vmov r4, r8, d4
; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: ldrh.w r11, [r11]
-; CHECK-NEXT: vmov.16 q2[0], r3
-; CHECK-NEXT: vmov.16 q2[1], r11
+; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: ldrh.w r8, [r8]
+; CHECK-NEXT: vmov.16 q2[0], r4
+; CHECK-NEXT: mov r4, r11
+; CHECK-NEXT: vmov.16 q2[1], r8
; CHECK-NEXT: vmov.16 q2[2], r1
; CHECK-NEXT: vmov.16 q2[3], r2
-; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: vmov.16 q2[4], r10
-; CHECK-NEXT: vmov.16 q2[5], r4
+; CHECK-NEXT: vmov.16 q2[5], r3
; CHECK-NEXT: vmov.16 q2[6], r7
; CHECK-NEXT: vmov.16 q2[7], r5
; CHECK-NEXT: vstrb.8 q2, [r6], #16
@@ -599,8 +609,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
-; CHECK-NEXT: cmp r3, r1
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: cmp r1, r9
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #28
@@ -659,139 +669,141 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: .pad #136
; CHECK-NEXT: sub sp, #136
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
+; CHECK-NEXT: str r1, [sp, #64] @ 4-byte Spill
; CHECK-NEXT: blt.w .LBB12_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT: lsrs r1, r2, #3
+; CHECK-NEXT: mvn r3, #7
+; CHECK-NEXT: add.w r1, r3, r1, lsl #3
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: adr r7, .LCPI12_1
+; CHECK-NEXT: vmov.i16 q2, #0x18
+; CHECK-NEXT: add.w r1, r3, r1, lsr #3
; CHECK-NEXT: adr r3, .LCPI12_2
; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: movs r2, #1
-; CHECK-NEXT: bic r1, r1, #7
-; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q2, #0x18
-; CHECK-NEXT: add.w r1, r2, r1, lsr #3
; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
; CHECK-NEXT: adr r1, .LCPI12_0
-; CHECK-NEXT: adr r2, .LCPI12_1
+; CHECK-NEXT: mov r12, r0
+; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
+; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: add r2, sp, #120
+; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: add r7, sp, #120
; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: .LBB12_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB12_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, sp, #104
; CHECK-NEXT: dls lr, r1
-; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
+; CHECK-NEXT: bic r1, r2, #7
+; CHECK-NEXT: ldr.w r8, [sp, #64] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload
+; CHECK-NEXT: str r1, [sp, #68] @ 4-byte Spill
; CHECK-NEXT: .LBB12_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vstrw.32 q5, [r2]
-; CHECK-NEXT: mov r8, r2
-; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
+; CHECK-NEXT: vstrw.32 q5, [r7]
+; CHECK-NEXT: add r0, sp, #88
+; CHECK-NEXT: vldrh.s32 q0, [r7, #8]
+; CHECK-NEXT: mov r10, r7
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q0, q0, r12
; CHECK-NEXT: vmov r1, r3, d0
; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vldrh.s32 q0, [r2]
+; CHECK-NEXT: vldrh.s32 q0, [r7]
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q2, q0, r0
-; CHECK-NEXT: vmov r6, r2, d4
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: ldrh.w r12, [r4]
-; CHECK-NEXT: add r4, sp, #88
+; CHECK-NEXT: vadd.i32 q2, q0, r12
+; CHECK-NEXT: vmov r6, r9, d4
+; CHECK-NEXT: ldrh r2, [r1]
+; CHECK-NEXT: ldrh r1, [r3]
+; CHECK-NEXT: ldrh r3, [r4]
; CHECK-NEXT: ldrh.w r11, [r5]
-; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: ldrh r5, [r6]
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: vstrw.32 q6, [r4]
-; CHECK-NEXT: vldrh.s32 q0, [r4]
+; CHECK-NEXT: ldrh.w r4, [r9]
+; CHECK-NEXT: vstrw.32 q6, [r0]
+; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vmov.16 q7[0], r5
-; CHECK-NEXT: vmov.16 q7[1], r2
+; CHECK-NEXT: vmov.16 q7[1], r4
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r6, r9, d0
-; CHECK-NEXT: vmov r2, r5, d1
-; CHECK-NEXT: vldrh.s32 q0, [r4, #8]
+; CHECK-NEXT: vadd.i32 q0, q0, r12
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: add r0, sp, #104
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q0, q0, r12
; CHECK-NEXT: ldrh r6, [r6]
-; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: vmov.16 q1[0], r6
-; CHECK-NEXT: ldrh.w r6, [r9]
+; CHECK-NEXT: ldrh r6, [r7]
; CHECK-NEXT: ldrh r5, [r5]
; CHECK-NEXT: vmov.16 q1[1], r6
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov r2, r6, d0
+; CHECK-NEXT: vmov.16 q1[2], r4
+; CHECK-NEXT: vmov r4, r6, d0
; CHECK-NEXT: vmov.16 q1[3], r5
-; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: ldrh r6, [r6]
-; CHECK-NEXT: vmov.16 q1[4], r2
-; CHECK-NEXT: vmov r2, r5, d1
+; CHECK-NEXT: vmov.16 q1[4], r4
+; CHECK-NEXT: vmov r4, r5, d1
; CHECK-NEXT: vmov.16 q1[5], r6
-; CHECK-NEXT: mov r6, r10
-; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vstrw.32 q4, [r10]
-; CHECK-NEXT: vldrh.s32 q0, [r6]
-; CHECK-NEXT: vmov.16 q1[6], r2
+; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: vldrh.s32 q0, [r0]
+; CHECK-NEXT: vmov.16 q1[6], r4
; CHECK-NEXT: vmov.16 q1[7], r5
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, r5, d0
-; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: vadd.i32 q0, q0, r12
+; CHECK-NEXT: vmov r4, r5, d0
+; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vmov.16 q3[0], r2
+; CHECK-NEXT: vmov.16 q3[0], r4
+; CHECK-NEXT: vmov r4, r6, d5
; CHECK-NEXT: vmov.16 q3[1], r5
-; CHECK-NEXT: vmov r2, r5, d5
; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
; CHECK-NEXT: vadd.i16 q6, q6, q2
; CHECK-NEXT: vadd.i16 q5, q5, q2
; CHECK-NEXT: vadd.i16 q4, q4, q2
-; CHECK-NEXT: ldrh.w r9, [r2]
-; CHECK-NEXT: vmov r2, r4, d1
-; CHECK-NEXT: vldrh.s32 q0, [r6, #8]
-; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vmov.16 q7[2], r9
+; CHECK-NEXT: ldrh r5, [r4]
+; CHECK-NEXT: ldrh r4, [r6]
+; CHECK-NEXT: vmov r6, r7, d1
+; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: vmov.16 q7[2], r5
+; CHECK-NEXT: vmov.16 q7[3], r4
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vmov.16 q7[3], r5
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.16 q7[4], r1
-; CHECK-NEXT: vmov.16 q7[5], r3
-; CHECK-NEXT: vmov.16 q7[6], r12
+; CHECK-NEXT: vmov.16 q7[4], r2
+; CHECK-NEXT: vadd.i32 q0, q0, r12
+; CHECK-NEXT: vmov.16 q7[5], r1
+; CHECK-NEXT: vmov.16 q7[6], r3
; CHECK-NEXT: vmov.16 q7[7], r11
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmov.16 q3[3], r4
-; CHECK-NEXT: vmov r2, r4, d0
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmov.16 q3[5], r4
-; CHECK-NEXT: vmov r2, r4, d1
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q3[6], r2
-; CHECK-NEXT: mov r2, r8
-; CHECK-NEXT: vmov.16 q3[7], r4
+; CHECK-NEXT: ldrh r6, [r6]
+; CHECK-NEXT: ldrh r7, [r7]
+; CHECK-NEXT: vmov.16 q3[2], r6
+; CHECK-NEXT: vmov.16 q3[3], r7
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: ldrh r6, [r6]
+; CHECK-NEXT: ldrh r7, [r7]
+; CHECK-NEXT: vmov.16 q3[4], r6
+; CHECK-NEXT: vmov.16 q3[5], r7
+; CHECK-NEXT: vmov r6, r7, d1
+; CHECK-NEXT: ldrh r6, [r6]
+; CHECK-NEXT: ldrh r7, [r7]
+; CHECK-NEXT: vmov.16 q3[6], r6
+; CHECK-NEXT: vmov.16 q3[7], r7
+; CHECK-NEXT: mov r7, r10
; CHECK-NEXT: vadd.i16 q0, q3, q1
; CHECK-NEXT: vadd.i16 q0, q0, q7
-; CHECK-NEXT: vstrb.8 q0, [r7], #16
+; CHECK-NEXT: vstrb.8 q0, [r8], #16
; CHECK-NEXT: le lr, .LBB12_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload
-; CHECK-NEXT: cmp r1, r3
+; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: bne.w .LBB12_2
; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #136
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
index 71384b672bb13d..2b0218b4ab90b9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
@@ -8,33 +8,35 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB0_1: @ %vector.ph.preheader
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: bic r12, r2, #3
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: lsr.w r12, r2, #2
+; CHECK-NEXT: mvn r3, #3
+; CHECK-NEXT: add.w r12, r3, r12, lsl #2
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: sub.w lr, r12, #4
-; CHECK-NEXT: add.w r4, r3, lr, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB0_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
-; CHECK-NEXT: dls lr, r4
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: dls lr, r12
+; CHECK-NEXT: bic r0, r2, #3
+; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB0_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #16]!
-; CHECK-NEXT: vstrb.8 q2, [r0], #16
+; CHECK-NEXT: vstrb.8 q2, [r3], #16
; CHECK-NEXT: le lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: cmp r12, r2
+; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: @ %bb.5:
-; CHECK-NEXT: pop.w {r4, lr}
+; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index e845070d579045..bf4b3b6b33ebcd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -130,32 +130,34 @@ define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i3
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB3_1: @ %vector.ph.preheader
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: adr r4, .LCPI3_2
-; CHECK-NEXT: bic r2, r1, #3
-; CHECK-NEXT: vldrw.u32 q3, [r4]
-; CHECK-NEXT: sub.w r12, r2, #4
-; CHECK-NEXT: adr.w lr, .LCPI3_1
+; CHECK-NEXT: lsrs r2, r1, #2
+; CHECK-NEXT: mvn r3, #3
+; CHECK-NEXT: add.w r2, r3, r2, lsl #2
; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: adr.w lr, .LCPI3_1
+; CHECK-NEXT: adr.w r12, .LCPI3_0
+; CHECK-NEXT: add.w r2, r3, r2, lsr #2
+; CHECK-NEXT: adr r3, .LCPI3_2
+; CHECK-NEXT: vldrw.u32 q3, [r3]
; CHECK-NEXT: vadd.i32 q3, q3, r0
-; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [lr]
-; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: vadd.i32 q4, q3, r0
; CHECK-NEXT: vldrw.u32 q3, [r12]
; CHECK-NEXT: vadd.i32 q3, q3, r0
; CHECK-NEXT: .LBB3_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT: dls lr, r3
-; CHECK-NEXT: vmov q6, q4
+; CHECK-NEXT: dls lr, r2
+; CHECK-NEXT: bic r0, r1, #3
; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
@@ -166,12 +168,12 @@ define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i3
; CHECK-NEXT: le lr, .LBB3_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT: cmp r2, r1
+; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: bne .LBB3_2
; CHECK-NEXT: @ %bb.5:
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop.w {r4, lr}
+; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
diff --git a/llvm/test/CodeGen/Thumb2/shift_parts.ll b/llvm/test/CodeGen/Thumb2/shift_parts.ll
index b4ac405d82ed50..6962e28889f213 100644
--- a/llvm/test/CodeGen/Thumb2/shift_parts.ll
+++ b/llvm/test/CodeGen/Thumb2/shift_parts.ll
@@ -493,10 +493,16 @@ entry:
}
define i32 @lsl_demand_topmask(i64 %x) {
-; CHECK-LABEL: lsl_demand_topmask:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: ubfx r0, r0, #1, #28
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: lsl_demand_topmask:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: lsll r0, r1, #31
+; CHECK-MVE-NEXT: bic r0, r1, #-268435456
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-NON-MVE-LABEL: lsl_demand_topmask:
+; CHECK-NON-MVE: @ %bb.0: @ %entry
+; CHECK-NON-MVE-NEXT: ubfx r0, r0, #1, #28
+; CHECK-NON-MVE-NEXT: bx lr
entry:
%sh = shl i64 %x, 31
%a = and i64 %sh, 1152921500311879680 ;0x0fffffff00000000
diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
index 58bafebd5b702f..e4abbe49964823 100644
--- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
@@ -26,9 +26,11 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK-LABEL: test_srem_even:
; CHECK: @ %bb.0:
; CHECK-NEXT: sbfx r1, r0, #0, #4
+; CHECK-NEXT: movs r2, #8
; CHECK-NEXT: add.w r1, r1, r1, lsl #1
-; CHECK-NEXT: ubfx r2, r1, #7, #1
-; CHECK-NEXT: add.w r1, r2, r1, lsr #4
+; CHECK-NEXT: and.w r2, r2, r1, lsr #4
+; CHECK-NEXT: lsrs r1, r1, #4
+; CHECK-NEXT: add.w r1, r1, r2, lsr #3
; CHECK-NEXT: add.w r1, r1, r1, lsl #1
; CHECK-NEXT: sub.w r0, r0, r1, lsl #1
; CHECK-NEXT: and r0, r0, #15
@@ -45,10 +47,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK-LABEL: test_srem_pow2_setne:
; CHECK: @ %bb.0:
; CHECK-NEXT: sbfx r1, r0, #0, #6
-; CHECK-NEXT: ubfx r1, r1, #9, #2
-; CHECK-NEXT: add r1, r0
-; CHECK-NEXT: and r1, r1, #60
-; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movs r2, #48
+; CHECK-NEXT: and.w r1, r2, r1, lsr #5
+; CHECK-NEXT: add.w r1, r0, r1, lsr #4
+; CHECK-NEXT: lsrs r1, r1, #2
+; CHECK-NEXT: sub.w r0, r0, r1, lsl #2
; CHECK-NEXT: ands r0, r0, #63
; CHECK-NEXT: it ne
; CHECK-NEXT: movne r0, #1
diff --git a/llvm/test/CodeGen/VE/Scalar/bitreverse.ll b/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
index e95f10e85de452..e1a31258f62937 100644
--- a/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
+++ b/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
@@ -61,7 +61,11 @@ define zeroext i16 @func16z(i16 zeroext %p) {
; CHECK-LABEL: func16z:
; CHECK: # %bb.0:
; CHECK-NEXT: brv %s0, %s0
-; CHECK-NEXT: srl %s0, %s0, 48
+; CHECK-NEXT: srl %s0, %s0, 32
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: srl %s0, %s0, 16
+; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i16 @llvm.bitreverse.i16(i16 %p)
ret i16 %r
@@ -81,7 +85,11 @@ define zeroext i8 @func8z(i8 zeroext %p) {
; CHECK-LABEL: func8z:
; CHECK: # %bb.0:
; CHECK-NEXT: brv %s0, %s0
-; CHECK-NEXT: srl %s0, %s0, 56
+; CHECK-NEXT: srl %s0, %s0, 32
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: srl %s0, %s0, 24
+; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i8 @llvm.bitreverse.i8(i8 %p)
ret i8 %r
diff --git a/llvm/test/CodeGen/WebAssembly/pr47375.ll b/llvm/test/CodeGen/WebAssembly/pr47375.ll
index 400380fb207722..9e26018b94a150 100644
--- a/llvm/test/CodeGen/WebAssembly/pr47375.ll
+++ b/llvm/test/CodeGen/WebAssembly/pr47375.ll
@@ -20,12 +20,14 @@ define void @sext_vec() {
; CHECK-NEXT: i32.store8 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 7
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 3
+; CHECK-NEXT: i32.shr_u
+; CHECK-NEXT: i32.const 10
; CHECK-NEXT: i32.shl
; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.const 7175
-; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.store16 0
; CHECK-NEXT: # fallthrough-return
%L1 = load <2 x i3>, ptr undef, align 2
diff --git a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
index ca160c091b2293..9f5370462cc4bc 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
@@ -362,39 +362,39 @@ define i32 @bitmask_v32i8(<32 x i8> %v) {
; CHECK-NEXT: i32.const 21
; CHECK-NEXT: i32.shl
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 4
+; CHECK-NEXT: i8x16.extract_lane_u 0
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 20
-; CHECK-NEXT: i32.shl
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 3
+; CHECK-NEXT: i8x16.extract_lane_u 1
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 19
+; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.extract_lane_u 2
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 18
+; CHECK-NEXT: i32.const 2
; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 1
+; CHECK-NEXT: i8x16.extract_lane_u 3
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 17
+; CHECK-NEXT: i32.const 3
; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 0
+; CHECK-NEXT: i8x16.extract_lane_u 4
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.shl
; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.shl
; CHECK-NEXT: i32.or
; CHECK-NEXT: i32.or
; CHECK-NEXT: i32.or
diff --git a/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll b/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
index 99e6e5cc3fd4c1..9635ee238aec43 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
@@ -17,7 +17,7 @@ define void @f(ptr %0, ptr %pr) {
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.shr_u
+; CHECK-NEXT: i32.shr_s
; CHECK-NEXT: local.tee 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.mul
@@ -27,7 +27,7 @@ define void @f(ptr %0, ptr %pr) {
; CHECK-NEXT: i32x4.extract_lane 1
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.shr_u
+; CHECK-NEXT: i32.shr_s
; CHECK-NEXT: local.tee 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.mul
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
index e6497bca98dc27..1915d60fb76cd5 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
@@ -690,183 +690,184 @@ define i1 @test_all_v64i8(<64 x i8> %x) {
; CHECK-LABEL: test_all_v64i8:
; CHECK: .functype test_all_v64i8 (v128, v128, v128, v128) -> (i32)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: global.get $push287=, __stack_pointer
-; CHECK-NEXT: i32.const $push288=, 16
-; CHECK-NEXT: i32.sub $drop=, $pop287, $pop288
-; CHECK-NEXT: i8x16.extract_lane_u $push220=, $2, 0
+; CHECK-NEXT: global.get $push283=, __stack_pointer
+; CHECK-NEXT: i32.const $push284=, 16
+; CHECK-NEXT: i32.sub $drop=, $pop283, $pop284
+; CHECK-NEXT: i8x16.extract_lane_u $push216=, $2, 0
; CHECK-NEXT: i32.const $push1=, 1
-; CHECK-NEXT: i32.and $push221=, $pop220, $pop1
-; CHECK-NEXT: i8x16.extract_lane_u $push222=, $2, 1
+; CHECK-NEXT: i32.and $push217=, $pop216, $pop1
+; CHECK-NEXT: i8x16.extract_lane_u $push218=, $2, 1
; CHECK-NEXT: i32.const $push380=, 1
-; CHECK-NEXT: i32.and $push223=, $pop222, $pop380
+; CHECK-NEXT: i32.and $push219=, $pop218, $pop380
; CHECK-NEXT: i32.const $push379=, 1
-; CHECK-NEXT: i32.shl $push224=, $pop223, $pop379
-; CHECK-NEXT: i32.or $push225=, $pop221, $pop224
-; CHECK-NEXT: i8x16.extract_lane_u $push226=, $2, 2
+; CHECK-NEXT: i32.shl $push220=, $pop219, $pop379
+; CHECK-NEXT: i32.or $push221=, $pop217, $pop220
+; CHECK-NEXT: i8x16.extract_lane_u $push222=, $2, 2
; CHECK-NEXT: i32.const $push378=, 1
-; CHECK-NEXT: i32.and $push227=, $pop226, $pop378
-; CHECK-NEXT: i32.const $push87=, 2
-; CHECK-NEXT: i32.shl $push228=, $pop227, $pop87
-; CHECK-NEXT: i32.or $push229=, $pop225, $pop228
-; CHECK-NEXT: i8x16.extract_lane_u $push230=, $2, 3
+; CHECK-NEXT: i32.and $push223=, $pop222, $pop378
+; CHECK-NEXT: i32.const $push9=, 2
+; CHECK-NEXT: i32.shl $push224=, $pop223, $pop9
+; CHECK-NEXT: i32.or $push225=, $pop221, $pop224
+; CHECK-NEXT: i8x16.extract_lane_u $push226=, $2, 3
; CHECK-NEXT: i32.const $push377=, 1
-; CHECK-NEXT: i32.and $push231=, $pop230, $pop377
-; CHECK-NEXT: i32.const $push92=, 3
-; CHECK-NEXT: i32.shl $push232=, $pop231, $pop92
-; CHECK-NEXT: i32.or $push233=, $pop229, $pop232
-; CHECK-NEXT: i8x16.extract_lane_u $push234=, $2, 4
+; CHECK-NEXT: i32.and $push227=, $pop226, $pop377
+; CHECK-NEXT: i32.const $push14=, 3
+; CHECK-NEXT: i32.shl $push228=, $pop227, $pop14
+; CHECK-NEXT: i32.or $push229=, $pop225, $pop228
+; CHECK-NEXT: i8x16.extract_lane_u $push230=, $2, 4
; CHECK-NEXT: i32.const $push376=, 1
-; CHECK-NEXT: i32.and $push235=, $pop234, $pop376
-; CHECK-NEXT: i32.const $push97=, 4
-; CHECK-NEXT: i32.shl $push236=, $pop235, $pop97
-; CHECK-NEXT: i32.or $push237=, $pop233, $pop236
-; CHECK-NEXT: i8x16.extract_lane_u $push238=, $2, 5
+; CHECK-NEXT: i32.and $push231=, $pop230, $pop376
+; CHECK-NEXT: i32.const $push19=, 4
+; CHECK-NEXT: i32.shl $push232=, $pop231, $pop19
+; CHECK-NEXT: i32.or $push233=, $pop229, $pop232
+; CHECK-NEXT: i8x16.extract_lane_u $push234=, $2, 5
; CHECK-NEXT: i32.const $push375=, 1
-; CHECK-NEXT: i32.and $push239=, $pop238, $pop375
-; CHECK-NEXT: i32.const $push102=, 5
-; CHECK-NEXT: i32.shl $push240=, $pop239, $pop102
-; CHECK-NEXT: i32.or $push241=, $pop237, $pop240
-; CHECK-NEXT: i8x16.extract_lane_u $push242=, $2, 6
+; CHECK-NEXT: i32.and $push235=, $pop234, $pop375
+; CHECK-NEXT: i32.const $push98=, 5
+; CHECK-NEXT: i32.shl $push236=, $pop235, $pop98
+; CHECK-NEXT: i32.or $push237=, $pop233, $pop236
+; CHECK-NEXT: i8x16.extract_lane_u $push238=, $2, 6
; CHECK-NEXT: i32.const $push374=, 1
-; CHECK-NEXT: i32.and $push243=, $pop242, $pop374
-; CHECK-NEXT: i32.const $push107=, 6
-; CHECK-NEXT: i32.shl $push244=, $pop243, $pop107
-; CHECK-NEXT: i32.or $push245=, $pop241, $pop244
-; CHECK-NEXT: i8x16.extract_lane_u $push246=, $2, 7
+; CHECK-NEXT: i32.and $push239=, $pop238, $pop374
+; CHECK-NEXT: i32.const $push103=, 6
+; CHECK-NEXT: i32.shl $push240=, $pop239, $pop103
+; CHECK-NEXT: i32.or $push241=, $pop237, $pop240
+; CHECK-NEXT: i8x16.extract_lane_u $push242=, $2, 7
; CHECK-NEXT: i32.const $push373=, 1
-; CHECK-NEXT: i32.and $push247=, $pop246, $pop373
-; CHECK-NEXT: i32.const $push112=, 7
-; CHECK-NEXT: i32.shl $push248=, $pop247, $pop112
-; CHECK-NEXT: i32.or $push249=, $pop245, $pop248
-; CHECK-NEXT: i8x16.extract_lane_u $push250=, $2, 8
+; CHECK-NEXT: i32.and $push243=, $pop242, $pop373
+; CHECK-NEXT: i32.const $push108=, 7
+; CHECK-NEXT: i32.shl $push244=, $pop243, $pop108
+; CHECK-NEXT: i32.or $push245=, $pop241, $pop244
+; CHECK-NEXT: i8x16.extract_lane_u $push246=, $2, 8
; CHECK-NEXT: i32.const $push372=, 1
-; CHECK-NEXT: i32.and $push251=, $pop250, $pop372
-; CHECK-NEXT: i32.const $push117=, 8
-; CHECK-NEXT: i32.shl $push252=, $pop251, $pop117
-; CHECK-NEXT: i32.or $push253=, $pop249, $pop252
-; CHECK-NEXT: i8x16.extract_lane_u $push254=, $2, 9
+; CHECK-NEXT: i32.and $push247=, $pop246, $pop372
+; CHECK-NEXT: i32.const $push113=, 8
+; CHECK-NEXT: i32.shl $push248=, $pop247, $pop113
+; CHECK-NEXT: i32.or $push249=, $pop245, $pop248
+; CHECK-NEXT: i8x16.extract_lane_u $push250=, $2, 9
; CHECK-NEXT: i32.const $push371=, 1
-; CHECK-NEXT: i32.and $push255=, $pop254, $pop371
-; CHECK-NEXT: i32.const $push122=, 9
-; CHECK-NEXT: i32.shl $push256=, $pop255, $pop122
-; CHECK-NEXT: i32.or $push257=, $pop253, $pop256
-; CHECK-NEXT: i8x16.extract_lane_u $push258=, $2, 10
+; CHECK-NEXT: i32.and $push251=, $pop250, $pop371
+; CHECK-NEXT: i32.const $push118=, 9
+; CHECK-NEXT: i32.shl $push252=, $pop251, $pop118
+; CHECK-NEXT: i32.or $push253=, $pop249, $pop252
+; CHECK-NEXT: i8x16.extract_lane_u $push254=, $2, 10
; CHECK-NEXT: i32.const $push370=, 1
-; CHECK-NEXT: i32.and $push259=, $pop258, $pop370
-; CHECK-NEXT: i32.const $push127=, 10
-; CHECK-NEXT: i32.shl $push260=, $pop259, $pop127
-; CHECK-NEXT: i32.or $push261=, $pop257, $pop260
-; CHECK-NEXT: i8x16.extract_lane_u $push262=, $2, 11
+; CHECK-NEXT: i32.and $push255=, $pop254, $pop370
+; CHECK-NEXT: i32.const $push123=, 10
+; CHECK-NEXT: i32.shl $push256=, $pop255, $pop123
+; CHECK-NEXT: i32.or $push257=, $pop253, $pop256
+; CHECK-NEXT: i8x16.extract_lane_u $push258=, $2, 11
; CHECK-NEXT: i32.const $push369=, 1
-; CHECK-NEXT: i32.and $push263=, $pop262, $pop369
-; CHECK-NEXT: i32.const $push132=, 11
-; CHECK-NEXT: i32.shl $push264=, $pop263, $pop132
-; CHECK-NEXT: i32.or $push265=, $pop261, $pop264
-; CHECK-NEXT: i8x16.extract_lane_u $push266=, $2, 12
+; CHECK-NEXT: i32.and $push259=, $pop258, $pop369
+; CHECK-NEXT: i32.const $push128=, 11
+; CHECK-NEXT: i32.shl $push260=, $pop259, $pop128
+; CHECK-NEXT: i32.or $push261=, $pop257, $pop260
+; CHECK-NEXT: i8x16.extract_lane_u $push262=, $2, 12
; CHECK-NEXT: i32.const $push368=, 1
-; CHECK-NEXT: i32.and $push267=, $pop266, $pop368
-; CHECK-NEXT: i32.const $push137=, 12
-; CHECK-NEXT: i32.shl $push268=, $pop267, $pop137
-; CHECK-NEXT: i32.or $push269=, $pop265, $pop268
-; CHECK-NEXT: i8x16.extract_lane_u $push270=, $2, 13
+; CHECK-NEXT: i32.and $push263=, $pop262, $pop368
+; CHECK-NEXT: i32.const $push133=, 12
+; CHECK-NEXT: i32.shl $push264=, $pop263, $pop133
+; CHECK-NEXT: i32.or $push265=, $pop261, $pop264
+; CHECK-NEXT: i8x16.extract_lane_u $push266=, $2, 13
; CHECK-NEXT: i32.const $push367=, 1
-; CHECK-NEXT: i32.and $push271=, $pop270, $pop367
-; CHECK-NEXT: i32.const $push142=, 13
-; CHECK-NEXT: i32.shl $push272=, $pop271, $pop142
-; CHECK-NEXT: i32.or $push273=, $pop269, $pop272
-; CHECK-NEXT: i8x16.extract_lane_u $push274=, $2, 14
+; CHECK-NEXT: i32.and $push267=, $pop266, $pop367
+; CHECK-NEXT: i32.const $push138=, 13
+; CHECK-NEXT: i32.shl $push268=, $pop267, $pop138
+; CHECK-NEXT: i32.or $push269=, $pop265, $pop268
+; CHECK-NEXT: i8x16.extract_lane_u $push270=, $2, 14
; CHECK-NEXT: i32.const $push366=, 1
-; CHECK-NEXT: i32.and $push275=, $pop274, $pop366
-; CHECK-NEXT: i32.const $push147=, 14
-; CHECK-NEXT: i32.shl $push276=, $pop275, $pop147
-; CHECK-NEXT: i32.or $push277=, $pop273, $pop276
-; CHECK-NEXT: i8x16.extract_lane_u $push278=, $2, 15
-; CHECK-NEXT: i32.const $push151=, 15
-; CHECK-NEXT: i32.shl $push279=, $pop278, $pop151
-; CHECK-NEXT: i32.or $push280=, $pop277, $pop279
-; CHECK-NEXT: i32.const $push154=, 65535
-; CHECK-NEXT: i32.and $push281=, $pop280, $pop154
-; CHECK-NEXT: i8x16.extract_lane_u $push217=, $3, 15
-; CHECK-NEXT: i32.const $push76=, 31
-; CHECK-NEXT: i32.shl $push218=, $pop217, $pop76
-; CHECK-NEXT: i8x16.extract_lane_u $push213=, $3, 14
+; CHECK-NEXT: i32.and $push271=, $pop270, $pop366
+; CHECK-NEXT: i32.const $push143=, 14
+; CHECK-NEXT: i32.shl $push272=, $pop271, $pop143
+; CHECK-NEXT: i32.or $push273=, $pop269, $pop272
+; CHECK-NEXT: i8x16.extract_lane_u $push274=, $2, 15
+; CHECK-NEXT: i32.const $push147=, 15
+; CHECK-NEXT: i32.shl $push275=, $pop274, $pop147
+; CHECK-NEXT: i32.or $push276=, $pop273, $pop275
+; CHECK-NEXT: i32.const $push150=, 65535
+; CHECK-NEXT: i32.and $push277=, $pop276, $pop150
+; CHECK-NEXT: i8x16.extract_lane_u $push213=, $3, 15
+; CHECK-NEXT: i32.const $push75=, 31
+; CHECK-NEXT: i32.shl $push214=, $pop213, $pop75
+; CHECK-NEXT: i8x16.extract_lane_u $push209=, $3, 14
; CHECK-NEXT: i32.const $push365=, 1
-; CHECK-NEXT: i32.and $push214=, $pop213, $pop365
-; CHECK-NEXT: i32.const $push72=, 30
-; CHECK-NEXT: i32.shl $push215=, $pop214, $pop72
-; CHECK-NEXT: i8x16.extract_lane_u $push209=, $3, 13
+; CHECK-NEXT: i32.and $push210=, $pop209, $pop365
+; CHECK-NEXT: i32.const $push71=, 30
+; CHECK-NEXT: i32.shl $push211=, $pop210, $pop71
+; CHECK-NEXT: i8x16.extract_lane_u $push205=, $3, 13
; CHECK-NEXT: i32.const $push364=, 1
-; CHECK-NEXT: i32.and $push210=, $pop209, $pop364
-; CHECK-NEXT: i32.const $push67=, 29
-; CHECK-NEXT: i32.shl $push211=, $pop210, $pop67
-; CHECK-NEXT: i8x16.extract_lane_u $push205=, $3, 12
+; CHECK-NEXT: i32.and $push206=, $pop205, $pop364
+; CHECK-NEXT: i32.const $push66=, 29
+; CHECK-NEXT: i32.shl $push207=, $pop206, $pop66
+; CHECK-NEXT: i8x16.extract_lane_u $push201=, $3, 12
; CHECK-NEXT: i32.const $push363=, 1
-; CHECK-NEXT: i32.and $push206=, $pop205, $pop363
-; CHECK-NEXT: i32.const $push62=, 28
-; CHECK-NEXT: i32.shl $push207=, $pop206, $pop62
-; CHECK-NEXT: i8x16.extract_lane_u $push201=, $3, 11
+; CHECK-NEXT: i32.and $push202=, $pop201, $pop363
+; CHECK-NEXT: i32.const $push61=, 28
+; CHECK-NEXT: i32.shl $push203=, $pop202, $pop61
+; CHECK-NEXT: i8x16.extract_lane_u $push197=, $3, 11
; CHECK-NEXT: i32.const $push362=, 1
-; CHECK-NEXT: i32.and $push202=, $pop201, $pop362
-; CHECK-NEXT: i32.const $push57=, 27
-; CHECK-NEXT: i32.shl $push203=, $pop202, $pop57
-; CHECK-NEXT: i8x16.extract_lane_u $push197=, $3, 10
+; CHECK-NEXT: i32.and $push198=, $pop197, $pop362
+; CHECK-NEXT: i32.const $push56=, 27
+; CHECK-NEXT: i32.shl $push199=, $pop198, $pop56
+; CHECK-NEXT: i8x16.extract_lane_u $push193=, $3, 10
; CHECK-NEXT: i32.const $push361=, 1
-; CHECK-NEXT: i32.and $push198=, $pop197, $pop361
-; CHECK-NEXT: i32.const $push52=, 26
-; CHECK-NEXT: i32.shl $push199=, $pop198, $pop52
-; CHECK-NEXT: i8x16.extract_lane_u $push193=, $3, 9
+; CHECK-NEXT: i32.and $push194=, $pop193, $pop361
+; CHECK-NEXT: i32.const $push51=, 26
+; CHECK-NEXT: i32.shl $push195=, $pop194, $pop51
+; CHECK-NEXT: i8x16.extract_lane_u $push189=, $3, 9
; CHECK-NEXT: i32.const $push360=, 1
-; CHECK-NEXT: i32.and $push194=, $pop193, $pop360
-; CHECK-NEXT: i32.const $push47=, 25
-; CHECK-NEXT: i32.shl $push195=, $pop194, $pop47
-; CHECK-NEXT: i8x16.extract_lane_u $push189=, $3, 8
+; CHECK-NEXT: i32.and $push190=, $pop189, $pop360
+; CHECK-NEXT: i32.const $push46=, 25
+; CHECK-NEXT: i32.shl $push191=, $pop190, $pop46
+; CHECK-NEXT: i8x16.extract_lane_u $push185=, $3, 8
; CHECK-NEXT: i32.const $push359=, 1
-; CHECK-NEXT: i32.and $push190=, $pop189, $pop359
-; CHECK-NEXT: i32.const $push42=, 24
-; CHECK-NEXT: i32.shl $push191=, $pop190, $pop42
-; CHECK-NEXT: i8x16.extract_lane_u $push185=, $3, 7
+; CHECK-NEXT: i32.and $push186=, $pop185, $pop359
+; CHECK-NEXT: i32.const $push41=, 24
+; CHECK-NEXT: i32.shl $push187=, $pop186, $pop41
+; CHECK-NEXT: i8x16.extract_lane_u $push181=, $3, 7
; CHECK-NEXT: i32.const $push358=, 1
-; CHECK-NEXT: i32.and $push186=, $pop185, $pop358
-; CHECK-NEXT: i32.const $push37=, 23
-; CHECK-NEXT: i32.shl $push187=, $pop186, $pop37
-; CHECK-NEXT: i8x16.extract_lane_u $push181=, $3, 6
+; CHECK-NEXT: i32.and $push182=, $pop181, $pop358
+; CHECK-NEXT: i32.const $push36=, 23
+; CHECK-NEXT: i32.shl $push183=, $pop182, $pop36
+; CHECK-NEXT: i8x16.extract_lane_u $push177=, $3, 6
; CHECK-NEXT: i32.const $push357=, 1
-; CHECK-NEXT: i32.and $push182=, $pop181, $pop357
-; CHECK-NEXT: i32.const $push32=, 22
-; CHECK-NEXT: i32.shl $push183=, $pop182, $pop32
-; CHECK-NEXT: i8x16.extract_lane_u $push177=, $3, 5
+; CHECK-NEXT: i32.and $push178=, $pop177, $pop357
+; CHECK-NEXT: i32.const $push31=, 22
+; CHECK-NEXT: i32.shl $push179=, $pop178, $pop31
+; CHECK-NEXT: i8x16.extract_lane_u $push173=, $3, 5
; CHECK-NEXT: i32.const $push356=, 1
-; CHECK-NEXT: i32.and $push178=, $pop177, $pop356
-; CHECK-NEXT: i32.const $push27=, 21
-; CHECK-NEXT: i32.shl $push179=, $pop178, $pop27
-; CHECK-NEXT: i8x16.extract_lane_u $push173=, $3, 4
+; CHECK-NEXT: i32.and $push174=, $pop173, $pop356
+; CHECK-NEXT: i32.const $push26=, 21
+; CHECK-NEXT: i32.shl $push175=, $pop174, $pop26
+; CHECK-NEXT: i8x16.extract_lane_u $push154=, $3, 0
; CHECK-NEXT: i32.const $push355=, 1
-; CHECK-NEXT: i32.and $push174=, $pop173, $pop355
-; CHECK-NEXT: i32.const $push22=, 20
-; CHECK-NEXT: i32.shl $push175=, $pop174, $pop22
-; CHECK-NEXT: i8x16.extract_lane_u $push169=, $3, 3
+; CHECK-NEXT: i32.and $push155=, $pop154, $pop355
+; CHECK-NEXT: i8x16.extract_lane_u $push156=, $3, 1
; CHECK-NEXT: i32.const $push354=, 1
-; CHECK-NEXT: i32.and $push170=, $pop169, $pop354
-; CHECK-NEXT: i32.const $push17=, 19
-; CHECK-NEXT: i32.shl $push171=, $pop170, $pop17
-; CHECK-NEXT: i8x16.extract_lane_u $push165=, $3, 2
+; CHECK-NEXT: i32.and $push157=, $pop156, $pop354
; CHECK-NEXT: i32.const $push353=, 1
-; CHECK-NEXT: i32.and $push166=, $pop165, $pop353
-; CHECK-NEXT: i32.const $push12=, 18
-; CHECK-NEXT: i32.shl $push167=, $pop166, $pop12
-; CHECK-NEXT: i8x16.extract_lane_u $push161=, $3, 1
+; CHECK-NEXT: i32.shl $push158=, $pop157, $pop353
+; CHECK-NEXT: i32.or $push159=, $pop155, $pop158
+; CHECK-NEXT: i8x16.extract_lane_u $push160=, $3, 2
; CHECK-NEXT: i32.const $push352=, 1
-; CHECK-NEXT: i32.and $push162=, $pop161, $pop352
-; CHECK-NEXT: i32.const $push7=, 17
-; CHECK-NEXT: i32.shl $push163=, $pop162, $pop7
-; CHECK-NEXT: i8x16.extract_lane_u $push158=, $3, 0
-; CHECK-NEXT: i32.const $push351=, 1
-; CHECK-NEXT: i32.and $push159=, $pop158, $pop351
-; CHECK-NEXT: i32.const $push3=, 16
-; CHECK-NEXT: i32.shl $push160=, $pop159, $pop3
-; CHECK-NEXT: i32.or $push164=, $pop163, $pop160
-; CHECK-NEXT: i32.or $push168=, $pop167, $pop164
-; CHECK-NEXT: i32.or $push172=, $pop171, $pop168
+; CHECK-NEXT: i32.and $push161=, $pop160, $pop352
+; CHECK-NEXT: i32.const $push351=, 2
+; CHECK-NEXT: i32.shl $push162=, $pop161, $pop351
+; CHECK-NEXT: i32.or $push163=, $pop159, $pop162
+; CHECK-NEXT: i8x16.extract_lane_u $push164=, $3, 3
+; CHECK-NEXT: i32.const $push350=, 1
+; CHECK-NEXT: i32.and $push165=, $pop164, $pop350
+; CHECK-NEXT: i32.const $push349=, 3
+; CHECK-NEXT: i32.shl $push166=, $pop165, $pop349
+; CHECK-NEXT: i32.or $push167=, $pop163, $pop166
+; CHECK-NEXT: i8x16.extract_lane_u $push168=, $3, 4
+; CHECK-NEXT: i32.const $push348=, 1
+; CHECK-NEXT: i32.and $push169=, $pop168, $pop348
+; CHECK-NEXT: i32.const $push347=, 4
+; CHECK-NEXT: i32.shl $push170=, $pop169, $pop347
+; CHECK-NEXT: i32.or $push171=, $pop167, $pop170
+; CHECK-NEXT: i32.const $push22=, 16
+; CHECK-NEXT: i32.shl $push172=, $pop171, $pop22
; CHECK-NEXT: i32.or $push176=, $pop175, $pop172
; CHECK-NEXT: i32.or $push180=, $pop179, $pop176
; CHECK-NEXT: i32.or $push184=, $pop183, $pop180
@@ -877,202 +878,201 @@ define i1 @test_all_v64i8(<64 x i8> %x) {
; CHECK-NEXT: i32.or $push204=, $pop203, $pop200
; CHECK-NEXT: i32.or $push208=, $pop207, $pop204
; CHECK-NEXT: i32.or $push212=, $pop211, $pop208
-; CHECK-NEXT: i32.or $push216=, $pop215, $pop212
-; CHECK-NEXT: i32.or $push219=, $pop218, $pop216
-; CHECK-NEXT: i32.or $push282=, $pop281, $pop219
-; CHECK-NEXT: i64.extend_i32_u $push283=, $pop282
-; CHECK-NEXT: i8x16.extract_lane_u $push79=, $0, 0
-; CHECK-NEXT: i32.const $push350=, 1
-; CHECK-NEXT: i32.and $push80=, $pop79, $pop350
-; CHECK-NEXT: i8x16.extract_lane_u $push81=, $0, 1
-; CHECK-NEXT: i32.const $push349=, 1
-; CHECK-NEXT: i32.and $push82=, $pop81, $pop349
-; CHECK-NEXT: i32.const $push348=, 1
-; CHECK-NEXT: i32.shl $push83=, $pop82, $pop348
-; CHECK-NEXT: i32.or $push84=, $pop80, $pop83
-; CHECK-NEXT: i8x16.extract_lane_u $push85=, $0, 2
-; CHECK-NEXT: i32.const $push347=, 1
-; CHECK-NEXT: i32.and $push86=, $pop85, $pop347
-; CHECK-NEXT: i32.const $push346=, 2
-; CHECK-NEXT: i32.shl $push88=, $pop86, $pop346
-; CHECK-NEXT: i32.or $push89=, $pop84, $pop88
-; CHECK-NEXT: i8x16.extract_lane_u $push90=, $0, 3
+; CHECK-NEXT: i32.or $push215=, $pop214, $pop212
+; CHECK-NEXT: i32.or $push278=, $pop277, $pop215
+; CHECK-NEXT: i64.extend_i32_u $push279=, $pop278
+; CHECK-NEXT: i8x16.extract_lane_u $push78=, $0, 0
+; CHECK-NEXT: i32.const $push346=, 1
+; CHECK-NEXT: i32.and $push79=, $pop78, $pop346
+; CHECK-NEXT: i8x16.extract_lane_u $push80=, $0, 1
; CHECK-NEXT: i32.const $push345=, 1
-; CHECK-NEXT: i32.and $push91=, $pop90, $pop345
-; CHECK-NEXT: i32.const $push344=, 3
-; CHECK-NEXT: i32.shl $push93=, $pop91, $pop344
-; CHECK-NEXT: i32.or $push94=, $pop89, $pop93
-; CHECK-NEXT: i8x16.extract_lane_u $push95=, $0, 4
+; CHECK-NEXT: i32.and $push81=, $pop80, $pop345
+; CHECK-NEXT: i32.const $push344=, 1
+; CHECK-NEXT: i32.shl $push82=, $pop81, $pop344
+; CHECK-NEXT: i32.or $push83=, $pop79, $pop82
+; CHECK-NEXT: i8x16.extract_lane_u $push84=, $0, 2
; CHECK-NEXT: i32.const $push343=, 1
-; CHECK-NEXT: i32.and $push96=, $pop95, $pop343
-; CHECK-NEXT: i32.const $push342=, 4
-; CHECK-NEXT: i32.shl $push98=, $pop96, $pop342
-; CHECK-NEXT: i32.or $push99=, $pop94, $pop98
-; CHECK-NEXT: i8x16.extract_lane_u $push100=, $0, 5
+; CHECK-NEXT: i32.and $push85=, $pop84, $pop343
+; CHECK-NEXT: i32.const $push342=, 2
+; CHECK-NEXT: i32.shl $push86=, $pop85, $pop342
+; CHECK-NEXT: i32.or $push87=, $pop83, $pop86
+; CHECK-NEXT: i8x16.extract_lane_u $push88=, $0, 3
; CHECK-NEXT: i32.const $push341=, 1
-; CHECK-NEXT: i32.and $push101=, $pop100, $pop341
-; CHECK-NEXT: i32.const $push340=, 5
-; CHECK-NEXT: i32.shl $push103=, $pop101, $pop340
-; CHECK-NEXT: i32.or $push104=, $pop99, $pop103
-; CHECK-NEXT: i8x16.extract_lane_u $push105=, $0, 6
+; CHECK-NEXT: i32.and $push89=, $pop88, $pop341
+; CHECK-NEXT: i32.const $push340=, 3
+; CHECK-NEXT: i32.shl $push90=, $pop89, $pop340
+; CHECK-NEXT: i32.or $push91=, $pop87, $pop90
+; CHECK-NEXT: i8x16.extract_lane_u $push92=, $0, 4
; CHECK-NEXT: i32.const $push339=, 1
-; CHECK-NEXT: i32.and $push106=, $pop105, $pop339
-; CHECK-NEXT: i32.const $push338=, 6
-; CHECK-NEXT: i32.shl $push108=, $pop106, $pop338
-; CHECK-NEXT: i32.or $push109=, $pop104, $pop108
-; CHECK-NEXT: i8x16.extract_lane_u $push110=, $0, 7
+; CHECK-NEXT: i32.and $push93=, $pop92, $pop339
+; CHECK-NEXT: i32.const $push338=, 4
+; CHECK-NEXT: i32.shl $push94=, $pop93, $pop338
+; CHECK-NEXT: i32.or $push95=, $pop91, $pop94
+; CHECK-NEXT: i8x16.extract_lane_u $push96=, $0, 5
; CHECK-NEXT: i32.const $push337=, 1
-; CHECK-NEXT: i32.and $push111=, $pop110, $pop337
-; CHECK-NEXT: i32.const $push336=, 7
-; CHECK-NEXT: i32.shl $push113=, $pop111, $pop336
-; CHECK-NEXT: i32.or $push114=, $pop109, $pop113
-; CHECK-NEXT: i8x16.extract_lane_u $push115=, $0, 8
+; CHECK-NEXT: i32.and $push97=, $pop96, $pop337
+; CHECK-NEXT: i32.const $push336=, 5
+; CHECK-NEXT: i32.shl $push99=, $pop97, $pop336
+; CHECK-NEXT: i32.or $push100=, $pop95, $pop99
+; CHECK-NEXT: i8x16.extract_lane_u $push101=, $0, 6
; CHECK-NEXT: i32.const $push335=, 1
-; CHECK-NEXT: i32.and $push116=, $pop115, $pop335
-; CHECK-NEXT: i32.const $push334=, 8
-; CHECK-NEXT: i32.shl $push118=, $pop116, $pop334
-; CHECK-NEXT: i32.or $push119=, $pop114, $pop118
-; CHECK-NEXT: i8x16.extract_lane_u $push120=, $0, 9
+; CHECK-NEXT: i32.and $push102=, $pop101, $pop335
+; CHECK-NEXT: i32.const $push334=, 6
+; CHECK-NEXT: i32.shl $push104=, $pop102, $pop334
+; CHECK-NEXT: i32.or $push105=, $pop100, $pop104
+; CHECK-NEXT: i8x16.extract_lane_u $push106=, $0, 7
; CHECK-NEXT: i32.const $push333=, 1
-; CHECK-NEXT: i32.and $push121=, $pop120, $pop333
-; CHECK-NEXT: i32.const $push332=, 9
-; CHECK-NEXT: i32.shl $push123=, $pop121, $pop332
-; CHECK-NEXT: i32.or $push124=, $pop119, $pop123
-; CHECK-NEXT: i8x16.extract_lane_u $push125=, $0, 10
+; CHECK-NEXT: i32.and $push107=, $pop106, $pop333
+; CHECK-NEXT: i32.const $push332=, 7
+; CHECK-NEXT: i32.shl $push109=, $pop107, $pop332
+; CHECK-NEXT: i32.or $push110=, $pop105, $pop109
+; CHECK-NEXT: i8x16.extract_lane_u $push111=, $0, 8
; CHECK-NEXT: i32.const $push331=, 1
-; CHECK-NEXT: i32.and $push126=, $pop125, $pop331
-; CHECK-NEXT: i32.const $push330=, 10
-; CHECK-NEXT: i32.shl $push128=, $pop126, $pop330
-; CHECK-NEXT: i32.or $push129=, $pop124, $pop128
-; CHECK-NEXT: i8x16.extract_lane_u $push130=, $0, 11
+; CHECK-NEXT: i32.and $push112=, $pop111, $pop331
+; CHECK-NEXT: i32.const $push330=, 8
+; CHECK-NEXT: i32.shl $push114=, $pop112, $pop330
+; CHECK-NEXT: i32.or $push115=, $pop110, $pop114
+; CHECK-NEXT: i8x16.extract_lane_u $push116=, $0, 9
; CHECK-NEXT: i32.const $push329=, 1
-; CHECK-NEXT: i32.and $push131=, $pop130, $pop329
-; CHECK-NEXT: i32.const $push328=, 11
-; CHECK-NEXT: i32.shl $push133=, $pop131, $pop328
-; CHECK-NEXT: i32.or $push134=, $pop129, $pop133
-; CHECK-NEXT: i8x16.extract_lane_u $push135=, $0, 12
+; CHECK-NEXT: i32.and $push117=, $pop116, $pop329
+; CHECK-NEXT: i32.const $push328=, 9
+; CHECK-NEXT: i32.shl $push119=, $pop117, $pop328
+; CHECK-NEXT: i32.or $push120=, $pop115, $pop119
+; CHECK-NEXT: i8x16.extract_lane_u $push121=, $0, 10
; CHECK-NEXT: i32.const $push327=, 1
-; CHECK-NEXT: i32.and $push136=, $pop135, $pop327
-; CHECK-NEXT: i32.const $push326=, 12
-; CHECK-NEXT: i32.shl $push138=, $pop136, $pop326
-; CHECK-NEXT: i32.or $push139=, $pop134, $pop138
-; CHECK-NEXT: i8x16.extract_lane_u $push140=, $0, 13
+; CHECK-NEXT: i32.and $push122=, $pop121, $pop327
+; CHECK-NEXT: i32.const $push326=, 10
+; CHECK-NEXT: i32.shl $push124=, $pop122, $pop326
+; CHECK-NEXT: i32.or $push125=, $pop120, $pop124
+; CHECK-NEXT: i8x16.extract_lane_u $push126=, $0, 11
; CHECK-NEXT: i32.const $push325=, 1
-; CHECK-NEXT: i32.and $push141=, $pop140, $pop325
-; CHECK-NEXT: i32.const $push324=, 13
-; CHECK-NEXT: i32.shl $push143=, $pop141, $pop324
-; CHECK-NEXT: i32.or $push144=, $pop139, $pop143
-; CHECK-NEXT: i8x16.extract_lane_u $push145=, $0, 14
+; CHECK-NEXT: i32.and $push127=, $pop126, $pop325
+; CHECK-NEXT: i32.const $push324=, 11
+; CHECK-NEXT: i32.shl $push129=, $pop127, $pop324
+; CHECK-NEXT: i32.or $push130=, $pop125, $pop129
+; CHECK-NEXT: i8x16.extract_lane_u $push131=, $0, 12
; CHECK-NEXT: i32.const $push323=, 1
-; CHECK-NEXT: i32.and $push146=, $pop145, $pop323
-; CHECK-NEXT: i32.const $push322=, 14
-; CHECK-NEXT: i32.shl $push148=, $pop146, $pop322
-; CHECK-NEXT: i32.or $push149=, $pop144, $pop148
-; CHECK-NEXT: i8x16.extract_lane_u $push150=, $0, 15
-; CHECK-NEXT: i32.const $push321=, 15
-; CHECK-NEXT: i32.shl $push152=, $pop150, $pop321
-; CHECK-NEXT: i32.or $push153=, $pop149, $pop152
-; CHECK-NEXT: i32.const $push320=, 65535
-; CHECK-NEXT: i32.and $push155=, $pop153, $pop320
-; CHECK-NEXT: i8x16.extract_lane_u $push75=, $1, 15
-; CHECK-NEXT: i32.const $push319=, 31
-; CHECK-NEXT: i32.shl $push77=, $pop75, $pop319
-; CHECK-NEXT: i8x16.extract_lane_u $push70=, $1, 14
-; CHECK-NEXT: i32.const $push318=, 1
-; CHECK-NEXT: i32.and $push71=, $pop70, $pop318
-; CHECK-NEXT: i32.const $push317=, 30
-; CHECK-NEXT: i32.shl $push73=, $pop71, $pop317
-; CHECK-NEXT: i8x16.extract_lane_u $push65=, $1, 13
-; CHECK-NEXT: i32.const $push316=, 1
-; CHECK-NEXT: i32.and $push66=, $pop65, $pop316
-; CHECK-NEXT: i32.const $push315=, 29
-; CHECK-NEXT: i32.shl $push68=, $pop66, $pop315
-; CHECK-NEXT: i8x16.extract_lane_u $push60=, $1, 12
+; CHECK-NEXT: i32.and $push132=, $pop131, $pop323
+; CHECK-NEXT: i32.const $push322=, 12
+; CHECK-NEXT: i32.shl $push134=, $pop132, $pop322
+; CHECK-NEXT: i32.or $push135=, $pop130, $pop134
+; CHECK-NEXT: i8x16.extract_lane_u $push136=, $0, 13
+; CHECK-NEXT: i32.const $push321=, 1
+; CHECK-NEXT: i32.and $push137=, $pop136, $pop321
+; CHECK-NEXT: i32.const $push320=, 13
+; CHECK-NEXT: i32.shl $push139=, $pop137, $pop320
+; CHECK-NEXT: i32.or $push140=, $pop135, $pop139
+; CHECK-NEXT: i8x16.extract_lane_u $push141=, $0, 14
+; CHECK-NEXT: i32.const $push319=, 1
+; CHECK-NEXT: i32.and $push142=, $pop141, $pop319
+; CHECK-NEXT: i32.const $push318=, 14
+; CHECK-NEXT: i32.shl $push144=, $pop142, $pop318
+; CHECK-NEXT: i32.or $push145=, $pop140, $pop144
+; CHECK-NEXT: i8x16.extract_lane_u $push146=, $0, 15
+; CHECK-NEXT: i32.const $push317=, 15
+; CHECK-NEXT: i32.shl $push148=, $pop146, $pop317
+; CHECK-NEXT: i32.or $push149=, $pop145, $pop148
+; CHECK-NEXT: i32.const $push316=, 65535
+; CHECK-NEXT: i32.and $push151=, $pop149, $pop316
+; CHECK-NEXT: i8x16.extract_lane_u $push74=, $1, 15
+; CHECK-NEXT: i32.const $push315=, 31
+; CHECK-NEXT: i32.shl $push76=, $pop74, $pop315
+; CHECK-NEXT: i8x16.extract_lane_u $push69=, $1, 14
; CHECK-NEXT: i32.const $push314=, 1
-; CHECK-NEXT: i32.and $push61=, $pop60, $pop314
-; CHECK-NEXT: i32.const $push313=, 28
-; CHECK-NEXT: i32.shl $push63=, $pop61, $pop313
-; CHECK-NEXT: i8x16.extract_lane_u $push55=, $1, 11
+; CHECK-NEXT: i32.and $push70=, $pop69, $pop314
+; CHECK-NEXT: i32.const $push313=, 30
+; CHECK-NEXT: i32.shl $push72=, $pop70, $pop313
+; CHECK-NEXT: i8x16.extract_lane_u $push64=, $1, 13
; CHECK-NEXT: i32.const $push312=, 1
-; CHECK-NEXT: i32.and $push56=, $pop55, $pop312
-; CHECK-NEXT: i32.const $push311=, 27
-; CHECK-NEXT: i32.shl $push58=, $pop56, $pop311
-; CHECK-NEXT: i8x16.extract_lane_u $push50=, $1, 10
+; CHECK-NEXT: i32.and $push65=, $pop64, $pop312
+; CHECK-NEXT: i32.const $push311=, 29
+; CHECK-NEXT: i32.shl $push67=, $pop65, $pop311
+; CHECK-NEXT: i8x16.extract_lane_u $push59=, $1, 12
; CHECK-NEXT: i32.const $push310=, 1
-; CHECK-NEXT: i32.and $push51=, $pop50, $pop310
-; CHECK-NEXT: i32.const $push309=, 26
-; CHECK-NEXT: i32.shl $push53=, $pop51, $pop309
-; CHECK-NEXT: i8x16.extract_lane_u $push45=, $1, 9
+; CHECK-NEXT: i32.and $push60=, $pop59, $pop310
+; CHECK-NEXT: i32.const $push309=, 28
+; CHECK-NEXT: i32.shl $push62=, $pop60, $pop309
+; CHECK-NEXT: i8x16.extract_lane_u $push54=, $1, 11
; CHECK-NEXT: i32.const $push308=, 1
-; CHECK-NEXT: i32.and $push46=, $pop45, $pop308
-; CHECK-NEXT: i32.const $push307=, 25
-; CHECK-NEXT: i32.shl $push48=, $pop46, $pop307
-; CHECK-NEXT: i8x16.extract_lane_u $push40=, $1, 8
+; CHECK-NEXT: i32.and $push55=, $pop54, $pop308
+; CHECK-NEXT: i32.const $push307=, 27
+; CHECK-NEXT: i32.shl $push57=, $pop55, $pop307
+; CHECK-NEXT: i8x16.extract_lane_u $push49=, $1, 10
; CHECK-NEXT: i32.const $push306=, 1
-; CHECK-NEXT: i32.and $push41=, $pop40, $pop306
-; CHECK-NEXT: i32.const $push305=, 24
-; CHECK-NEXT: i32.shl $push43=, $pop41, $pop305
-; CHECK-NEXT: i8x16.extract_lane_u $push35=, $1, 7
+; CHECK-NEXT: i32.and $push50=, $pop49, $pop306
+; CHECK-NEXT: i32.const $push305=, 26
+; CHECK-NEXT: i32.shl $push52=, $pop50, $pop305
+; CHECK-NEXT: i8x16.extract_lane_u $push44=, $1, 9
; CHECK-NEXT: i32.const $push304=, 1
-; CHECK-NEXT: i32.and $push36=, $pop35, $pop304
-; CHECK-NEXT: i32.const $push303=, 23
-; CHECK-NEXT: i32.shl $push38=, $pop36, $pop303
-; CHECK-NEXT: i8x16.extract_lane_u $push30=, $1, 6
+; CHECK-NEXT: i32.and $push45=, $pop44, $pop304
+; CHECK-NEXT: i32.const $push303=, 25
+; CHECK-NEXT: i32.shl $push47=, $pop45, $pop303
+; CHECK-NEXT: i8x16.extract_lane_u $push39=, $1, 8
; CHECK-NEXT: i32.const $push302=, 1
-; CHECK-NEXT: i32.and $push31=, $pop30, $pop302
-; CHECK-NEXT: i32.const $push301=, 22
-; CHECK-NEXT: i32.shl $push33=, $pop31, $pop301
-; CHECK-NEXT: i8x16.extract_lane_u $push25=, $1, 5
+; CHECK-NEXT: i32.and $push40=, $pop39, $pop302
+; CHECK-NEXT: i32.const $push301=, 24
+; CHECK-NEXT: i32.shl $push42=, $pop40, $pop301
+; CHECK-NEXT: i8x16.extract_lane_u $push34=, $1, 7
; CHECK-NEXT: i32.const $push300=, 1
-; CHECK-NEXT: i32.and $push26=, $pop25, $pop300
-; CHECK-NEXT: i32.const $push299=, 21
-; CHECK-NEXT: i32.shl $push28=, $pop26, $pop299
-; CHECK-NEXT: i8x16.extract_lane_u $push20=, $1, 4
+; CHECK-NEXT: i32.and $push35=, $pop34, $pop300
+; CHECK-NEXT: i32.const $push299=, 23
+; CHECK-NEXT: i32.shl $push37=, $pop35, $pop299
+; CHECK-NEXT: i8x16.extract_lane_u $push29=, $1, 6
; CHECK-NEXT: i32.const $push298=, 1
-; CHECK-NEXT: i32.and $push21=, $pop20, $pop298
-; CHECK-NEXT: i32.const $push297=, 20
-; CHECK-NEXT: i32.shl $push23=, $pop21, $pop297
-; CHECK-NEXT: i8x16.extract_lane_u $push15=, $1, 3
+; CHECK-NEXT: i32.and $push30=, $pop29, $pop298
+; CHECK-NEXT: i32.const $push297=, 22
+; CHECK-NEXT: i32.shl $push32=, $pop30, $pop297
+; CHECK-NEXT: i8x16.extract_lane_u $push24=, $1, 5
; CHECK-NEXT: i32.const $push296=, 1
-; CHECK-NEXT: i32.and $push16=, $pop15, $pop296
-; CHECK-NEXT: i32.const $push295=, 19
-; CHECK-NEXT: i32.shl $push18=, $pop16, $pop295
-; CHECK-NEXT: i8x16.extract_lane_u $push10=, $1, 2
+; CHECK-NEXT: i32.and $push25=, $pop24, $pop296
+; CHECK-NEXT: i32.const $push295=, 21
+; CHECK-NEXT: i32.shl $push27=, $pop25, $pop295
+; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0
; CHECK-NEXT: i32.const $push294=, 1
-; CHECK-NEXT: i32.and $push11=, $pop10, $pop294
-; CHECK-NEXT: i32.const $push293=, 18
-; CHECK-NEXT: i32.shl $push13=, $pop11, $pop293
-; CHECK-NEXT: i8x16.extract_lane_u $push5=, $1, 1
+; CHECK-NEXT: i32.and $push2=, $pop0, $pop294
+; CHECK-NEXT: i8x16.extract_lane_u $push3=, $1, 1
+; CHECK-NEXT: i32.const $push293=, 1
+; CHECK-NEXT: i32.and $push4=, $pop3, $pop293
; CHECK-NEXT: i32.const $push292=, 1
-; CHECK-NEXT: i32.and $push6=, $pop5, $pop292
-; CHECK-NEXT: i32.const $push291=, 17
-; CHECK-NEXT: i32.shl $push8=, $pop6, $pop291
-; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0
-; CHECK-NEXT: i32.const $push290=, 1
-; CHECK-NEXT: i32.and $push2=, $pop0, $pop290
-; CHECK-NEXT: i32.const $push289=, 16
-; CHECK-NEXT: i32.shl $push4=, $pop2, $pop289
-; CHECK-NEXT: i32.or $push9=, $pop8, $pop4
-; CHECK-NEXT: i32.or $push14=, $pop13, $pop9
-; CHECK-NEXT: i32.or $push19=, $pop18, $pop14
-; CHECK-NEXT: i32.or $push24=, $pop23, $pop19
-; CHECK-NEXT: i32.or $push29=, $pop28, $pop24
-; CHECK-NEXT: i32.or $push34=, $pop33, $pop29
-; CHECK-NEXT: i32.or $push39=, $pop38, $pop34
-; CHECK-NEXT: i32.or $push44=, $pop43, $pop39
-; CHECK-NEXT: i32.or $push49=, $pop48, $pop44
-; CHECK-NEXT: i32.or $push54=, $pop53, $pop49
-; CHECK-NEXT: i32.or $push59=, $pop58, $pop54
-; CHECK-NEXT: i32.or $push64=, $pop63, $pop59
-; CHECK-NEXT: i32.or $push69=, $pop68, $pop64
-; CHECK-NEXT: i32.or $push74=, $pop73, $pop69
-; CHECK-NEXT: i32.or $push78=, $pop77, $pop74
-; CHECK-NEXT: i32.or $push156=, $pop155, $pop78
-; CHECK-NEXT: i64.extend_i32_u $push157=, $pop156
-; CHECK-NEXT: i64.and $push284=, $pop283, $pop157
-; CHECK-NEXT: i64.const $push285=, 4294967295
-; CHECK-NEXT: i64.eq $push286=, $pop284, $pop285
-; CHECK-NEXT: return $pop286
+; CHECK-NEXT: i32.shl $push5=, $pop4, $pop292
+; CHECK-NEXT: i32.or $push6=, $pop2, $pop5
+; CHECK-NEXT: i8x16.extract_lane_u $push7=, $1, 2
+; CHECK-NEXT: i32.const $push291=, 1
+; CHECK-NEXT: i32.and $push8=, $pop7, $pop291
+; CHECK-NEXT: i32.const $push290=, 2
+; CHECK-NEXT: i32.shl $push10=, $pop8, $pop290
+; CHECK-NEXT: i32.or $push11=, $pop6, $pop10
+; CHECK-NEXT: i8x16.extract_lane_u $push12=, $1, 3
+; CHECK-NEXT: i32.const $push289=, 1
+; CHECK-NEXT: i32.and $push13=, $pop12, $pop289
+; CHECK-NEXT: i32.const $push288=, 3
+; CHECK-NEXT: i32.shl $push15=, $pop13, $pop288
+; CHECK-NEXT: i32.or $push16=, $pop11, $pop15
+; CHECK-NEXT: i8x16.extract_lane_u $push17=, $1, 4
+; CHECK-NEXT: i32.const $push287=, 1
+; CHECK-NEXT: i32.and $push18=, $pop17, $pop287
+; CHECK-NEXT: i32.const $push286=, 4
+; CHECK-NEXT: i32.shl $push20=, $pop18, $pop286
+; CHECK-NEXT: i32.or $push21=, $pop16, $pop20
+; CHECK-NEXT: i32.const $push285=, 16
+; CHECK-NEXT: i32.shl $push23=, $pop21, $pop285
+; CHECK-NEXT: i32.or $push28=, $pop27, $pop23
+; CHECK-NEXT: i32.or $push33=, $pop32, $pop28
+; CHECK-NEXT: i32.or $push38=, $pop37, $pop33
+; CHECK-NEXT: i32.or $push43=, $pop42, $pop38
+; CHECK-NEXT: i32.or $push48=, $pop47, $pop43
+; CHECK-NEXT: i32.or $push53=, $pop52, $pop48
+; CHECK-NEXT: i32.or $push58=, $pop57, $pop53
+; CHECK-NEXT: i32.or $push63=, $pop62, $pop58
+; CHECK-NEXT: i32.or $push68=, $pop67, $pop63
+; CHECK-NEXT: i32.or $push73=, $pop72, $pop68
+; CHECK-NEXT: i32.or $push77=, $pop76, $pop73
+; CHECK-NEXT: i32.or $push152=, $pop151, $pop77
+; CHECK-NEXT: i64.extend_i32_u $push153=, $pop152
+; CHECK-NEXT: i64.and $push280=, $pop279, $pop153
+; CHECK-NEXT: i64.const $push281=, 4294967295
+; CHECK-NEXT: i64.eq $push282=, $pop280, $pop281
+; CHECK-NEXT: return $pop282
%bits = trunc <64 x i8> %x to <64 x i1>
%ret = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %bits)
ret i1 %ret
diff --git a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
index 0d63779227554c..b24fb6f016b12a 100644
--- a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
+++ b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
@@ -24,7 +24,12 @@ define void @passing2(i64 %str.0, i64 %str.1, i16 signext %s, i32 %j, i8 signex
; CHECK-NEXT: movw %r9w, {{[0-9]+}}(%rsp)
; CHECK-NEXT: shll $14, %edi
; CHECK-NEXT: sarl $23, %edi
-; CHECK-NEXT: cmpl %ecx, %edi
+; CHECK-NEXT: shlw $7, %di
+; CHECK-NEXT: sarw $7, %di
+; CHECK-NEXT: shlw $7, %di
+; CHECK-NEXT: sarw $7, %di
+; CHECK-NEXT: movswl %di, %eax
+; CHECK-NEXT: cmpl %ecx, %eax
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: ## %bb.1: ## %bb27
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %al
@@ -33,15 +38,21 @@ define void @passing2(i64 %str.0, i64 %str.1, i16 signext %s, i32 %j, i8 signex
; CHECK-NEXT: ## %bb.2: ## %bb35
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: shll $7, %eax
-; CHECK-NEXT: cwtl
-; CHECK-NEXT: shrl $7, %eax
+; CHECK-NEXT: sarw $7, %ax
+; CHECK-NEXT: shlw $7, %ax
+; CHECK-NEXT: sarw $7, %ax
+; CHECK-NEXT: shlw $7, %ax
+; CHECK-NEXT: sarw $7, %ax
; CHECK-NEXT: cmpw {{[0-9]+}}(%rsp), %ax
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: ## %bb.3: ## %bb51
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: shll $7, %eax
-; CHECK-NEXT: cwtl
-; CHECK-NEXT: shrl $7, %eax
+; CHECK-NEXT: sarw $7, %ax
+; CHECK-NEXT: shlw $7, %ax
+; CHECK-NEXT: sarw $7, %ax
+; CHECK-NEXT: shlw $7, %ax
+; CHECK-NEXT: sarw $7, %ax
; CHECK-NEXT: cmpw {{[0-9]+}}(%rsp), %ax
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: ## %bb.4: ## %bb67
diff --git a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
index 50e736ac68d29e..1fb6d05abb701d 100644
--- a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -19,9 +19,9 @@ target triple = "x86_64-unknown-linux-gnu"
define i64 @foo(i64 %b) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movsbq %dil, %rax
-; CHECK-NEXT: shlq $8, %rax
-; CHECK-NEXT: incq %rax
+; CHECK-NEXT: shlq $56, %rdi
+; CHECK-NEXT: sarq $48, %rdi
+; CHECK-NEXT: leaq 1(%rdi), %rax
; CHECK-NEXT: retq
entry:
%shl = shl i64 %b, 56 ; <i64> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/add-ext.ll b/llvm/test/CodeGen/X86/add-ext.ll
index 12df378af3de3e..af1c8a5ac3940d 100644
--- a/llvm/test/CodeGen/X86/add-ext.ll
+++ b/llvm/test/CodeGen/X86/add-ext.ll
@@ -127,8 +127,9 @@ define ptr @gep128(i32 %i, ptr %x) {
; CHECK-LABEL: gep128:
; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
+; CHECK-NEXT: addq $5, %rax
; CHECK-NEXT: shlq $4, %rax
-; CHECK-NEXT: leaq 80(%rax,%rsi), %rax
+; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll
index daba729bf040f2..f45caeb42d65d0 100644
--- a/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll
@@ -52,8 +52,8 @@ define void @foo_sext_nsw(i1 zeroext, i32) nounwind {
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
; X64-NEXT: cltq
-; X64-NEXT: shlq $2, %rax
-; X64-NEXT: leaq 20(%rax,%rax,4), %rdi
+; X64-NEXT: leaq 4(,%rax,4), %rax
+; X64-NEXT: leaq (%rax,%rax,4), %rdi
; X64-NEXT: callq bar@PLT
; X64-NEXT: jmp .LBB0_2
br i1 %0, label %9, label %3
@@ -195,8 +195,8 @@ define void @foo_zext_nuw(i1 zeroext, i32) nounwind {
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_2: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl %eax, %eax
-; X64-NEXT: shlq $2, %rax
-; X64-NEXT: leaq 20(%rax,%rax,4), %rdi
+; X64-NEXT: leaq 4(,%rax,4), %rax
+; X64-NEXT: leaq (%rax,%rax,4), %rdi
; X64-NEXT: callq bar@PLT
; X64-NEXT: jmp .LBB3_2
br i1 %0, label %9, label %3
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
index daa521d3917cdf..605265fc0ecf4a 100644
--- a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
@@ -50,10 +50,11 @@ define i32 @mask_add_zext_i32_i64(ptr %base, i32 %i) {
define i32 @mask_offset_scale_zext_i32_i64(ptr %base, i32 %i) {
; X86-LABEL: mask_offset_scale_zext_i32_i64:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll $11, %ecx
-; X86-NEXT: movl 48(%eax,%ecx), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: leal 12(%eax,%eax), %eax
+; X86-NEXT: movl (%ecx,%eax,4), %eax
; X86-NEXT: retl
;
; X64-LABEL: mask_offset_scale_zext_i32_i64:
diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll
index f39c4b5e620d0e..28984bd18a7dee 100644
--- a/llvm/test/CodeGen/X86/atomic-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll
@@ -34,7 +34,7 @@ define i16 @bts2() nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btsw $1, v16
; X86-NEXT: setb %al
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -43,7 +43,7 @@ define i16 @bts2() nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btsw $1, v16(%rip)
; X64-NEXT: setb %al
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: addw %ax, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -58,7 +58,7 @@ define i16 @bts15() nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btsw $15, v16
; X86-NEXT: setb %al
-; X86-NEXT: shll $15, %eax
+; X86-NEXT: shlw $15, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -67,7 +67,7 @@ define i16 @bts15() nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btsw $15, v16(%rip)
; X64-NEXT: setb %al
-; X64-NEXT: shll $15, %eax
+; X64-NEXT: shlw $15, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -162,7 +162,7 @@ define i16 @btc2() nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btcw $1, v16
; X86-NEXT: setb %al
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -171,7 +171,7 @@ define i16 @btc2() nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btcw $1, v16(%rip)
; X64-NEXT: setb %al
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: addw %ax, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -286,7 +286,7 @@ define i16 @btr2() nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btrw $1, v16
; X86-NEXT: setb %al
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -295,7 +295,7 @@ define i16 @btr2() nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btrw $1, v16(%rip)
; X64-NEXT: setb %al
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: addw %ax, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -310,7 +310,7 @@ define i16 @btr15() nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btrw $15, v16
; X86-NEXT: setb %al
-; X86-NEXT: shll $15, %eax
+; X86-NEXT: shlw $15, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -319,7 +319,7 @@ define i16 @btr15() nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btrw $15, v16(%rip)
; X64-NEXT: setb %al
-; X64-NEXT: shll $15, %eax
+; X64-NEXT: shlw $15, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -443,19 +443,22 @@ entry:
define i16 @multi_use2() nounwind {
; X86-LABEL: multi_use2:
; X86: # %bb.0: # %entry
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: lock btsw $0, v16
-; X86-NEXT: setb %al
-; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addw %ax, %ax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: multi_use2:
; X64: # %bb.0: # %entry
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: lock btsw $0, v16(%rip)
-; X64-NEXT: setb %al
-; X64-NEXT: leal (%rax,%rax,2), %eax
+; X64-NEXT: setb %cl
+; X64-NEXT: leal (%rcx,%rcx), %eax
+; X64-NEXT: orl %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
index 8ff4f4067dabda..ad85a090010f8e 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
@@ -1392,8 +1392,11 @@ return: ; preds = %entry, %if.then
define i64 @atomic_shl1_xor_64_const_br(ptr %v) nounwind {
; CHECK-LABEL: atomic_shl1_xor_64_const_br:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: lock btcq $4, (%rdi)
-; CHECK-NEXT: jae .LBB48_1
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: shlq $4, %rax
+; CHECK-NEXT: je .LBB48_1
; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: movq 32(%rdi), %rax
; CHECK-NEXT: retq
@@ -1455,9 +1458,12 @@ return: ; preds = %entry, %if.then
define i64 @atomic_shl1_xor_64_const_brz(ptr %v) nounwind {
; CHECK-LABEL: atomic_shl1_xor_64_const_brz:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: lock btcq $4, (%rdi)
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: shlq $4, %rax
; CHECK-NEXT: movl $123, %eax
-; CHECK-NEXT: jae .LBB50_1
+; CHECK-NEXT: je .LBB50_1
; CHECK-NEXT: # %bb.2: # %return
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB50_1: # %if.then
@@ -1518,8 +1524,11 @@ return: ; preds = %entry, %if.then
define i64 @atomic_shl1_xor_64_const_brnz(ptr %v) nounwind {
; CHECK-LABEL: atomic_shl1_xor_64_const_brnz:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: lock btcq $4, (%rdi)
-; CHECK-NEXT: jae .LBB52_1
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: shlq $4, %rax
+; CHECK-NEXT: je .LBB52_1
; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: movq 32(%rdi), %rax
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index 5594d13a234d02..54151466bffcc2 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -764,7 +764,7 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_val(ptr %v, i16 zeroext %c
; X86-NEXT: lock btcw %cx, (%edx)
; X86-NEXT: setb %al
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
+; X86-NEXT: shlw %cl, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -776,7 +776,7 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_val(ptr %v, i16 zeroext %c
; X64-NEXT: lock btcw %cx, (%rdi)
; X64-NEXT: setb %al
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shll %cl, %eax
+; X64-NEXT: shlw %cl, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -796,8 +796,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %edi
-; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movw $1, %di
+; X86-NEXT: shlw %cl, %di
; X86-NEXT: movzwl (%esi), %eax
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB14_1: # %atomicrmw.start
@@ -822,8 +822,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movw $1, %dx
+; X64-NEXT: shlw %cl, %dx
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB14_1: # %atomicrmw.start
@@ -873,9 +873,9 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
; X86-NEXT: jne .LBB15_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %edx
+; X86-NEXT: movw $1, %dx
; X86-NEXT: # kill: def $cl killed $cl killed $cx
-; X86-NEXT: shll %cl, %edx
+; X86-NEXT: shlw %cl, %dx
; X86-NEXT: andl %edx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
@@ -899,9 +899,9 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
; X64-NEXT: jne .LBB15_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
+; X64-NEXT: movw $1, %dx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: shlw %cl, %dx
; X64-NEXT: andl %edx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -926,7 +926,7 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_val(ptr %v, i16 zeroext %c) no
; X86-NEXT: lock btcw %cx, (%edx)
; X86-NEXT: setb %al
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
+; X86-NEXT: shlw %cl, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -938,7 +938,7 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_val(ptr %v, i16 zeroext %c) no
; X64-NEXT: lock btcw %cx, (%rdi)
; X64-NEXT: setb %al
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shll %cl, %eax
+; X64-NEXT: shlw %cl, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -1140,8 +1140,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_valz(ptr %v, i16 zeroext %c) no
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %edi
-; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movw $1, %di
+; X86-NEXT: shlw %cl, %di
; X86-NEXT: movzwl (%esi), %eax
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB20_1: # %atomicrmw.start
@@ -1166,8 +1166,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_valz(ptr %v, i16 zeroext %c) no
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movw $1, %dx
+; X64-NEXT: shlw %cl, %dx
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB20_1: # %atomicrmw.start
@@ -1533,8 +1533,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) n
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %edi
-; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movw $1, %di
+; X86-NEXT: shlw %cl, %di
; X86-NEXT: movzwl (%esi), %eax
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB26_1: # %atomicrmw.start
@@ -1559,8 +1559,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) n
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movw $1, %dx
+; X64-NEXT: shlw %cl, %dx
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB26_1: # %atomicrmw.start
@@ -1959,8 +1959,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nou
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %esi
-; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movw $1, %si
+; X86-NEXT: shlw %cl, %si
; X86-NEXT: movzwl (%edx), %eax
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB32_1: # %atomicrmw.start
@@ -1988,8 +1988,8 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nou
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movw $1, %dx
+; X64-NEXT: shlw %cl, %dx
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB32_1: # %atomicrmw.start
@@ -2353,7 +2353,7 @@ define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_val(ptr %v, i16 zeroext %c
; X86-NEXT: lock btrw %cx, (%edx)
; X86-NEXT: setb %al
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
+; X86-NEXT: shlw %cl, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -2365,7 +2365,7 @@ define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_val(ptr %v, i16 zeroext %c
; X64-NEXT: lock btrw %cx, (%rdi)
; X64-NEXT: setb %al
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shll %cl, %eax
+; X64-NEXT: shlw %cl, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -2461,9 +2461,9 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_val(ptr %v, i16 zeroext %c) nou
; X86-NEXT: jne .LBB39_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %edx
+; X86-NEXT: movw $1, %dx
; X86-NEXT: # kill: def $cl killed $cl killed $cx
-; X86-NEXT: shll %cl, %edx
+; X86-NEXT: shlw %cl, %dx
; X86-NEXT: andl %edx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
@@ -2487,9 +2487,9 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_val(ptr %v, i16 zeroext %c) nou
; X64-NEXT: jne .LBB39_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
+; X64-NEXT: movw $1, %dx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: shlw %cl, %dx
; X64-NEXT: andl %edx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -2515,7 +2515,7 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_val(ptr %v, i16 zeroext %c) no
; X86-NEXT: lock btrw %cx, (%edx)
; X86-NEXT: setb %al
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
+; X86-NEXT: shlw %cl, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -2527,7 +2527,7 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_val(ptr %v, i16 zeroext %c) no
; X64-NEXT: lock btrw %cx, (%rdi)
; X64-NEXT: setb %al
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shll %cl, %eax
+; X64-NEXT: shlw %cl, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -3544,7 +3544,7 @@ define zeroext i16 @atomic_shl1_or_16_const_val(ptr %v) nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btsw $4, (%ecx)
; X86-NEXT: setb %al
-; X86-NEXT: shll $4, %eax
+; X86-NEXT: shlw $4, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -3553,7 +3553,7 @@ define zeroext i16 @atomic_shl1_or_16_const_val(ptr %v) nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btsw $4, (%rdi)
; X64-NEXT: setb %al
-; X64-NEXT: shll $4, %eax
+; X64-NEXT: shlw $4, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -3652,7 +3652,7 @@ define zeroext i16 @atomic_shl1_and_16_const_val(ptr %v) nounwind {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: lock btrw $4, (%ecx)
; X86-NEXT: setb %al
-; X86-NEXT: shll $4, %eax
+; X86-NEXT: shlw $4, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -3661,7 +3661,7 @@ define zeroext i16 @atomic_shl1_and_16_const_val(ptr %v) nounwind {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: lock btrw $4, (%rdi)
; X64-NEXT: setb %al
-; X64-NEXT: shll $4, %eax
+; X64-NEXT: shlw $4, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b39b089faa2a5e..8cd1af25819234 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -681,12 +681,12 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; KNL-NEXT: pushq %rbx
; KNL-NEXT: xorl %r10d, %r10d
; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF
+; KNL-NEXT: movl $65535, %ebx ## imm = 0xFFFF
; KNL-NEXT: movl $0, %r11d
-; KNL-NEXT: cmovnel %eax, %r11d
+; KNL-NEXT: cmovnel %ebx, %r11d
; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: cmovnel %eax, %r10d
; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: cmovnel %ebx, %r10d
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
; KNL-NEXT: andl $1, %edi
; KNL-NEXT: kmovw %edi, %k0
@@ -939,6 +939,8 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftrw $14, %k0, %k1
; KNL-NEXT: andl $1, %edx
+; KNL-NEXT: shll $16, %edx
+; KNL-NEXT: shrl $16, %edx
; KNL-NEXT: movb %dl, 2(%rax)
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: andl $1, %edx
@@ -1004,19 +1006,19 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: pushq %r13
; SKX-NEXT: pushq %r12
; SKX-NEXT: pushq %rbx
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kshiftrd $30, %k0, %k0
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $31, %k1, %k1
; SKX-NEXT: kord %k0, %k1, %k0
; SKX-NEXT: movl $-5, %edi
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k2
+; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $29, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
@@ -1029,18 +1031,18 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: kshiftrd $28, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-17, %edi
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT: kandd %k1, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $27, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-33, %edi
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k2
+; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $26, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
@@ -1053,18 +1055,18 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: kshiftrd $25, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-129, %edi
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT: kandd %k1, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $24, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-257, %edi ## imm = 0xFEFF
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k2
+; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $23, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
@@ -1077,145 +1079,145 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: kshiftrd $22, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-1025, %edi ## imm = 0xFBFF
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT: kandd %k1, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $21, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-2049, %edi ## imm = 0xF7FF
-; SKX-NEXT: kmovd %edi, %k6
-; SKX-NEXT: kandd %k6, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k2
+; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $20, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-4097, %edi ## imm = 0xEFFF
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k3
+; SKX-NEXT: kandd %k3, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $19, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-8193, %edi ## imm = 0xDFFF
-; SKX-NEXT: kmovd %edi, %k5
+; SKX-NEXT: kmovd %edi, %k4
+; SKX-NEXT: kandd %k4, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k5, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $18, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-16385, %edi ## imm = 0xBFFF
-; SKX-NEXT: kmovd %edi, %k4
-; SKX-NEXT: kandd %k4, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k5
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT: kandd %k5, %k0, %k0
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $17, %k1, %k1
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: movl $-32769, %edi ## imm = 0xFFFF7FFF
-; SKX-NEXT: kmovd %edi, %k3
-; SKX-NEXT: kandd %k3, %k0, %k0
+; SKX-NEXT: kmovd %edi, %k2
+; SKX-NEXT: kandd %k2, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
; SKX-NEXT: kshiftld $31, %k7, %k7
; SKX-NEXT: kshiftrd $16, %k7, %k7
; SKX-NEXT: kord %k7, %k0, %k7
; SKX-NEXT: movl $-65537, %edi ## imm = 0xFFFEFFFF
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT: kandd %k2, %k7, %k7
-; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kshiftrd $15, %k0, %k0
-; SKX-NEXT: kord %k0, %k7, %k0
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: kandd %k1, %k7, %k7
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT: kshiftld $31, %k6, %k6
+; SKX-NEXT: kshiftrd $15, %k6, %k6
+; SKX-NEXT: kord %k6, %k7, %k0
; SKX-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kmovd %edx, %k0
-; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kshiftrd $30, %k0, %k0
+; SKX-NEXT: kmovd %edx, %k6
+; SKX-NEXT: kshiftld $31, %k6, %k6
+; SKX-NEXT: kshiftrd $30, %k6, %k6
; SKX-NEXT: kmovd %esi, %k7
; SKX-NEXT: kshiftld $31, %k7, %k7
; SKX-NEXT: kshiftrd $31, %k7, %k7
-; SKX-NEXT: kord %k0, %k7, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kord %k6, %k7, %k6
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; SKX-NEXT: kandd %k0, %k6, %k6
; SKX-NEXT: kmovd %ecx, %k7
; SKX-NEXT: kshiftld $31, %k7, %k7
; SKX-NEXT: kshiftrd $29, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kord %k7, %k6, %k6
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; SKX-NEXT: kandd %k0, %k6, %k6
; SKX-NEXT: kmovd %r8d, %k7
; SKX-NEXT: kshiftld $31, %k7, %k7
; SKX-NEXT: kshiftrd $28, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
+; SKX-NEXT: kord %k7, %k6, %k6
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; SKX-NEXT: kandd %k0, %k6, %k6
; SKX-NEXT: kmovd %r9d, %k7
; SKX-NEXT: kshiftld $31, %k7, %k7
; SKX-NEXT: kshiftrd $27, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k0
+; SKX-NEXT: kord %k7, %k6, %k6
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; SKX-NEXT: kandd %k0, %k6, %k6
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k1
; SKX-NEXT: kshiftld $31, %k7, %k7
; SKX-NEXT: kshiftrd $26, %k7, %k7
+; SKX-NEXT: kord %k7, %k6, %k6
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; SKX-NEXT: kandd %k0, %k6, %k6
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT: kshiftld $31, %k7, %k7
+; SKX-NEXT: kshiftrd $25, %k7, %k7
+; SKX-NEXT: kord %k7, %k6, %k6
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; SKX-NEXT: kandd %k0, %k6, %k6
+; SKX-NEXT: kshiftld $31, %k7, %k7
+; SKX-NEXT: kshiftrd $24, %k7, %k7
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT: kord %k7, %k1, %k1
+; SKX-NEXT: kord %k7, %k6, %k6
; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload
-; SKX-NEXT: kandd %k7, %k1, %k1
+; SKX-NEXT: kandd %k7, %k6, %k6
; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kshiftrd $25, %k0, %k0
-; SKX-NEXT: kord %k0, %k1, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $24, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $23, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $22, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $21, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT: kshiftrd $23, %k0, %k0
+; SKX-NEXT: kord %k0, %k6, %k0
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
; SKX-NEXT: kandd %k6, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $20, %k1, %k1
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k6, %k1
-; SKX-NEXT: kshiftrd $19, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kandd %k5, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $18, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: kandd %k4, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $17, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
+; SKX-NEXT: kshiftld $31, %k6, %k6
+; SKX-NEXT: kshiftrd $22, %k6, %k6
+; SKX-NEXT: kord %k6, %k0, %k0
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
+; SKX-NEXT: kandd %k6, %k0, %k0
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT: kshiftld $31, %k6, %k6
+; SKX-NEXT: kshiftrd $21, %k6, %k6
+; SKX-NEXT: kord %k6, %k0, %k0
+; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
+; SKX-NEXT: kandd %k6, %k0, %k0
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT: kshiftld $31, %k6, %k6
+; SKX-NEXT: kshiftrd $20, %k6, %k6
+; SKX-NEXT: kord %k6, %k0, %k0
; SKX-NEXT: kandd %k3, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $16, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT: kshiftld $31, %k3, %k3
+; SKX-NEXT: kshiftrd $19, %k3, %k3
+; SKX-NEXT: kord %k3, %k0, %k0
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT: kandd %k4, %k0, %k0
+; SKX-NEXT: kshiftld $31, %k3, %k3
+; SKX-NEXT: kshiftrd $18, %k3, %k3
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4
+; SKX-NEXT: kord %k3, %k0, %k0
+; SKX-NEXT: kandd %k5, %k0, %k0
+; SKX-NEXT: kshiftld $31, %k4, %k3
+; SKX-NEXT: kshiftrd $17, %k3, %k3
+; SKX-NEXT: kord %k3, %k0, %k0
; SKX-NEXT: kandd %k2, %k0, %k0
+; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT: kshiftld $31, %k2, %k2
+; SKX-NEXT: kshiftrd $16, %k2, %k2
+; SKX-NEXT: kord %k2, %k0, %k0
+; SKX-NEXT: kandd %k1, %k0, %k0
; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $15, %k1, %k1
@@ -1252,6 +1254,8 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: kmovd %k1, %r13d
; SKX-NEXT: kshiftrd $14, %k0, %k1
; SKX-NEXT: andl $1, %edx
+; SKX-NEXT: shll $16, %edx
+; SKX-NEXT: shrl $16, %edx
; SKX-NEXT: movb %dl, 2(%rax)
; SKX-NEXT: kmovd %k0, %edx
; SKX-NEXT: andl $1, %edx
@@ -1565,6 +1569,8 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; KNL_X32-NEXT: kmovw %k1, %ecx
; KNL_X32-NEXT: kshiftrw $6, %k0, %k1
; KNL_X32-NEXT: andl $1, %ebx
+; KNL_X32-NEXT: shll $16, %ebx
+; KNL_X32-NEXT: shrl $16, %ebx
; KNL_X32-NEXT: movb %bl, 2(%eax)
; KNL_X32-NEXT: kmovw %k0, %ebx
; KNL_X32-NEXT: andl $1, %ebx
@@ -1645,19 +1651,19 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; FASTISEL-NEXT: pushq %r13
; FASTISEL-NEXT: pushq %r12
; FASTISEL-NEXT: pushq %rbx
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
; FASTISEL-NEXT: movq %rdi, %rax
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
; FASTISEL-NEXT: kshiftld $31, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; FASTISEL-NEXT: kshiftrd $30, %k0, %k0
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $31, %k1, %k1
; FASTISEL-NEXT: kord %k0, %k1, %k0
; FASTISEL-NEXT: movl $-5, %edi
-; FASTISEL-NEXT: kmovd %edi, %k1
-; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k2
+; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $29, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
@@ -1670,18 +1676,18 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; FASTISEL-NEXT: kshiftrd $28, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-17, %edi
-; FASTISEL-NEXT: kmovd %edi, %k2
-; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT: kmovd %edi, %k1
+; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT: kandd %k1, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $27, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-33, %edi
-; FASTISEL-NEXT: kmovd %edi, %k1
-; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k2
+; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $26, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
@@ -1694,18 +1700,18 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; FASTISEL-NEXT: kshiftrd $25, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-129, %edi
-; FASTISEL-NEXT: kmovd %edi, %k2
-; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT: kmovd %edi, %k1
+; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT: kandd %k1, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $24, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-257, %edi ## imm = 0xFEFF
-; FASTISEL-NEXT: kmovd %edi, %k1
-; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k2
+; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $23, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
@@ -1718,145 +1724,145 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; FASTISEL-NEXT: kshiftrd $22, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-1025, %edi ## imm = 0xFBFF
-; FASTISEL-NEXT: kmovd %edi, %k2
-; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT: kmovd %edi, %k1
+; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT: kandd %k1, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $21, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-2049, %edi ## imm = 0xF7FF
-; FASTISEL-NEXT: kmovd %edi, %k6
-; FASTISEL-NEXT: kandd %k6, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k2
+; FASTISEL-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $20, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-4097, %edi ## imm = 0xEFFF
-; FASTISEL-NEXT: kmovd %edi, %k1
-; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k3
+; FASTISEL-NEXT: kandd %k3, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $19, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-8193, %edi ## imm = 0xDFFF
-; FASTISEL-NEXT: kmovd %edi, %k5
+; FASTISEL-NEXT: kmovd %edi, %k4
+; FASTISEL-NEXT: kandd %k4, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kandd %k5, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $18, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-16385, %edi ## imm = 0xBFFF
-; FASTISEL-NEXT: kmovd %edi, %k4
-; FASTISEL-NEXT: kandd %k4, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k5
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT: kandd %k5, %k0, %k0
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $17, %k1, %k1
; FASTISEL-NEXT: kord %k1, %k0, %k0
; FASTISEL-NEXT: movl $-32769, %edi ## imm = 0xFFFF7FFF
-; FASTISEL-NEXT: kmovd %edi, %k3
-; FASTISEL-NEXT: kandd %k3, %k0, %k0
+; FASTISEL-NEXT: kmovd %edi, %k2
+; FASTISEL-NEXT: kandd %k2, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
; FASTISEL-NEXT: kshiftld $31, %k7, %k7
; FASTISEL-NEXT: kshiftrd $16, %k7, %k7
; FASTISEL-NEXT: kord %k7, %k0, %k7
; FASTISEL-NEXT: movl $-65537, %edi ## imm = 0xFFFEFFFF
-; FASTISEL-NEXT: kmovd %edi, %k2
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT: kandd %k2, %k7, %k7
-; FASTISEL-NEXT: kshiftld $31, %k0, %k0
-; FASTISEL-NEXT: kshiftrd $15, %k0, %k0
-; FASTISEL-NEXT: kord %k0, %k7, %k0
+; FASTISEL-NEXT: kmovd %edi, %k1
+; FASTISEL-NEXT: kandd %k1, %k7, %k7
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT: kshiftld $31, %k6, %k6
+; FASTISEL-NEXT: kshiftrd $15, %k6, %k6
+; FASTISEL-NEXT: kord %k6, %k7, %k0
; FASTISEL-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT: kmovd %edx, %k0
-; FASTISEL-NEXT: kshiftld $31, %k0, %k0
-; FASTISEL-NEXT: kshiftrd $30, %k0, %k0
+; FASTISEL-NEXT: kmovd %edx, %k6
+; FASTISEL-NEXT: kshiftld $31, %k6, %k6
+; FASTISEL-NEXT: kshiftrd $30, %k6, %k6
; FASTISEL-NEXT: kmovd %esi, %k7
; FASTISEL-NEXT: kshiftld $31, %k7, %k7
; FASTISEL-NEXT: kshiftrd $31, %k7, %k7
-; FASTISEL-NEXT: kord %k0, %k7, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kord %k6, %k7, %k6
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k0, %k6, %k6
; FASTISEL-NEXT: kmovd %ecx, %k7
; FASTISEL-NEXT: kshiftld $31, %k7, %k7
; FASTISEL-NEXT: kshiftrd $29, %k7, %k7
-; FASTISEL-NEXT: kord %k7, %k0, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kord %k7, %k6, %k6
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k0, %k6, %k6
; FASTISEL-NEXT: kmovd %r8d, %k7
; FASTISEL-NEXT: kshiftld $31, %k7, %k7
; FASTISEL-NEXT: kshiftrd $28, %k7, %k7
-; FASTISEL-NEXT: kord %k7, %k0, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
+; FASTISEL-NEXT: kord %k7, %k6, %k6
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k0, %k6, %k6
; FASTISEL-NEXT: kmovd %r9d, %k7
; FASTISEL-NEXT: kshiftld $31, %k7, %k7
; FASTISEL-NEXT: kshiftrd $27, %k7, %k7
-; FASTISEL-NEXT: kord %k7, %k0, %k0
+; FASTISEL-NEXT: kord %k7, %k6, %k6
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k0, %k6, %k6
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k1
; FASTISEL-NEXT: kshiftld $31, %k7, %k7
; FASTISEL-NEXT: kshiftrd $26, %k7, %k7
+; FASTISEL-NEXT: kord %k7, %k6, %k6
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k0, %k6, %k6
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
+; FASTISEL-NEXT: kshiftld $31, %k7, %k7
+; FASTISEL-NEXT: kshiftrd $25, %k7, %k7
+; FASTISEL-NEXT: kord %k7, %k6, %k6
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k0, %k6, %k6
+; FASTISEL-NEXT: kshiftld $31, %k7, %k7
+; FASTISEL-NEXT: kshiftrd $24, %k7, %k7
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT: kord %k7, %k1, %k1
+; FASTISEL-NEXT: kord %k7, %k6, %k6
; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k7, %k1, %k1
+; FASTISEL-NEXT: kandd %k7, %k6, %k6
; FASTISEL-NEXT: kshiftld $31, %k0, %k0
-; FASTISEL-NEXT: kshiftrd $25, %k0, %k0
-; FASTISEL-NEXT: kord %k0, %k1, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $24, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $23, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $22, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $21, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT: kshiftrd $23, %k0, %k0
+; FASTISEL-NEXT: kord %k0, %k6, %k0
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
; FASTISEL-NEXT: kandd %k6, %k0, %k0
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $20, %k1, %k1
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT: kandd %k1, %k0, %k0
-; FASTISEL-NEXT: kshiftld $31, %k6, %k1
-; FASTISEL-NEXT: kshiftrd $19, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kandd %k5, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $18, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
-; FASTISEL-NEXT: kandd %k4, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $17, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
+; FASTISEL-NEXT: kshiftld $31, %k6, %k6
+; FASTISEL-NEXT: kshiftrd $22, %k6, %k6
+; FASTISEL-NEXT: kord %k6, %k0, %k0
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k6, %k0, %k0
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT: kshiftld $31, %k6, %k6
+; FASTISEL-NEXT: kshiftrd $21, %k6, %k6
+; FASTISEL-NEXT: kord %k6, %k0, %k0
+; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
+; FASTISEL-NEXT: kandd %k6, %k0, %k0
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT: kshiftld $31, %k6, %k6
+; FASTISEL-NEXT: kshiftrd $20, %k6, %k6
+; FASTISEL-NEXT: kord %k6, %k0, %k0
; FASTISEL-NEXT: kandd %k3, %k0, %k0
-; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT: kshiftld $31, %k1, %k1
-; FASTISEL-NEXT: kshiftrd $16, %k1, %k1
-; FASTISEL-NEXT: kord %k1, %k0, %k0
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k3
+; FASTISEL-NEXT: kshiftld $31, %k3, %k3
+; FASTISEL-NEXT: kshiftrd $19, %k3, %k3
+; FASTISEL-NEXT: kord %k3, %k0, %k0
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k3
+; FASTISEL-NEXT: kandd %k4, %k0, %k0
+; FASTISEL-NEXT: kshiftld $31, %k3, %k3
+; FASTISEL-NEXT: kshiftrd $18, %k3, %k3
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k4
+; FASTISEL-NEXT: kord %k3, %k0, %k0
+; FASTISEL-NEXT: kandd %k5, %k0, %k0
+; FASTISEL-NEXT: kshiftld $31, %k4, %k3
+; FASTISEL-NEXT: kshiftrd $17, %k3, %k3
+; FASTISEL-NEXT: kord %k3, %k0, %k0
; FASTISEL-NEXT: kandd %k2, %k0, %k0
+; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT: kshiftld $31, %k2, %k2
+; FASTISEL-NEXT: kshiftrd $16, %k2, %k2
+; FASTISEL-NEXT: kord %k2, %k0, %k0
+; FASTISEL-NEXT: kandd %k1, %k0, %k0
; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
; FASTISEL-NEXT: kshiftld $31, %k1, %k1
; FASTISEL-NEXT: kshiftrd $15, %k1, %k1
@@ -1893,6 +1899,8 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; FASTISEL-NEXT: kmovd %k1, %r13d
; FASTISEL-NEXT: kshiftrd $14, %k0, %k1
; FASTISEL-NEXT: andl $1, %edx
+; FASTISEL-NEXT: shll $16, %edx
+; FASTISEL-NEXT: shrl $16, %edx
; FASTISEL-NEXT: movb %dl, 2(%rax)
; FASTISEL-NEXT: kmovd %k0, %edx
; FASTISEL-NEXT: andl $1, %edx
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 39d8e2d50c91ea..ab9f3be9717a99 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -542,14 +542,13 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: subq $56, %rsp
+; SSE2-NEXT: subq $72, %rsp
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %xmm1, %rdx
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -557,82 +556,89 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movl %ecx, %eax
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT: movq %xmm0, %r15
-; SSE2-NEXT: movq %r15, %rbx
-; SSE2-NEXT: shrq $48, %rbx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movq %rax, %r15
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: movq %xmm1, %r14
-; SSE2-NEXT: movq %r14, %rbp
-; SSE2-NEXT: shrq $48, %rbp
-; SSE2-NEXT: movq %r15, %r12
-; SSE2-NEXT: shrq $32, %r12
-; SSE2-NEXT: movq %r14, %r13
-; SSE2-NEXT: shrq $32, %r13
-; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movl %r15d, %eax
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movq %rcx, %r12
+; SSE2-NEXT: movq %rax, %rbp
+; SSE2-NEXT: movq %rcx, %r13
+; SSE2-NEXT: movl %ecx, %r14d
+; SSE2-NEXT: shll $16, %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movl %eax, %ebx
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movl %edx, %eax
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: shrq $48, %r15
+; SSE2-NEXT: shrq $48, %r12
+; SSE2-NEXT: shrq $32, %rbp
+; SSE2-NEXT: shrq $32, %r13
+; SSE2-NEXT: shrl $16, %ebx
+; SSE2-NEXT: shrl $16, %r14d
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: andl $-65536, %r14d # imm = 0xFFFF0000
+; SSE2-NEXT: shll $16, %r14d
; SSE2-NEXT: movd %r14d, %xmm1
-; SSE2-NEXT: andl $-65536, %r15d # imm = 0xFFFF0000
-; SSE2-NEXT: movd %r15d, %xmm0
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: pextrw $0, %xmm0, %r15d
-; SSE2-NEXT: shll $16, %r15d
-; SSE2-NEXT: addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
+; SSE2-NEXT: pextrw $0, %xmm0, %r14d
+; SSE2-NEXT: shll $16, %r14d
+; SSE2-NEXT: addl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
; SSE2-NEXT: shll $16, %r13d
; SSE2-NEXT: movd %r13d, %xmm1
-; SSE2-NEXT: shll $16, %r12d
-; SSE2-NEXT: movd %r12d, %xmm0
+; SSE2-NEXT: shll $16, %ebp
+; SSE2-NEXT: movd %ebp, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: movzwl %ax, %r14d
-; SSE2-NEXT: shll $16, %ebp
-; SSE2-NEXT: movd %ebp, %xmm1
-; SSE2-NEXT: shll $16, %ebx
-; SSE2-NEXT: movd %ebx, %xmm0
+; SSE2-NEXT: movzwl %ax, %ebx
+; SSE2-NEXT: shll $16, %r12d
+; SSE2-NEXT: movd %r12d, %xmm1
+; SSE2-NEXT: shll $16, %r15d
+; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: pextrw $0, %xmm0, %ebx
-; SSE2-NEXT: shll $16, %ebx
-; SSE2-NEXT: orl %r14d, %ebx
-; SSE2-NEXT: shlq $32, %rbx
-; SSE2-NEXT: orq %r15, %rbx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT: movl %r15d, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %r15d
+; SSE2-NEXT: shll $16, %r15d
+; SSE2-NEXT: orl %ebx, %r15d
+; SSE2-NEXT: shlq $32, %r15
+; SSE2-NEXT: orq %r14, %r15
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT: movl %r14d, %eax
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebp
-; SSE2-NEXT: movq %r15, %rax
-; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: pextrw $0, %xmm0, %r14d
-; SSE2-NEXT: shll $16, %r14d
-; SSE2-NEXT: orl %ebp, %r14d
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
@@ -655,11 +661,11 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
-; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: orq %rbx, %rax
; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %rbx, %xmm1
+; SSE2-NEXT: movq %r15, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: addq $56, %rsp
+; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -1590,13 +1596,13 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; SSE2-LABEL: pr62997_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; SSE2-NEXT: andq %rax, %rcx
-; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: movzwl %ax, %ecx
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: pextrw $0, %xmm4, %edx
; SSE2-NEXT: shll $16, %edx
-; SSE2-NEXT: orl %eax, %edx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: orl %ecx, %edx
+; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 4f2654843728f4..c06a89e50e9c0d 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -12,42 +12,46 @@ declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X86-LABEL: test_bitreverse_v2i16:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $8, %ax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andl $3855, %edx # imm = 0xF0F
-; X86-NEXT: shll $4, %edx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
+; X86-NEXT: shlw $4, %cx
; X86-NEXT: shrl $4, %eax
; X86-NEXT: andl $3855, %eax # imm = 0xF0F
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andl $13107, %edx # imm = 0x3333
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $13107, %ecx # imm = 0x3333
+; X86-NEXT: shlw $2, %cx
; X86-NEXT: shrl $2, %eax
; X86-NEXT: andl $13107, %eax # imm = 0x3333
-; X86-NEXT: leal (%eax,%edx,4), %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andl $21845, %edx # imm = 0x5555
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $21845, %ecx # imm = 0x5555
+; X86-NEXT: addw %cx, %cx
; X86-NEXT: shrl %eax
; X86-NEXT: andl $21845, %eax # imm = 0x5555
-; X86-NEXT: leal (%eax,%edx,2), %eax
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $3855, %edx # imm = 0xF0F
-; X86-NEXT: shll $4, %edx
-; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: rolw $8, %dx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $13107, %edx # imm = 0x3333
-; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: shlw $4, %cx
+; X86-NEXT: shrl $4, %edx
+; X86-NEXT: andl $3855, %edx # imm = 0xF0F
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andl $13107, %ecx # imm = 0x3333
-; X86-NEXT: leal (%ecx,%edx,4), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $21845, %edx # imm = 0x5555
-; X86-NEXT: shrl %ecx
+; X86-NEXT: shlw $2, %cx
+; X86-NEXT: shrl $2, %edx
+; X86-NEXT: andl $13107, %edx # imm = 0x3333
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andl $21845, %ecx # imm = 0x5555
-; X86-NEXT: leal (%ecx,%edx,2), %edx
+; X86-NEXT: addw %cx, %cx
+; X86-NEXT: shrl %edx
+; X86-NEXT: andl $21845, %edx # imm = 0x5555
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: # kill: def $dx killed $dx killed $edx
; X86-NEXT: retl
@@ -324,43 +328,46 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X86-NEXT: rolw $8, %ax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
-; X86-NEXT: shll $4, %ecx
+; X86-NEXT: shlw $4, %cx
; X86-NEXT: shrl $4, %eax
; X86-NEXT: andl $3855, %eax # imm = 0xF0F
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $13107, %ecx # imm = 0x3333
+; X86-NEXT: shlw $2, %cx
; X86-NEXT: shrl $2, %eax
; X86-NEXT: andl $13107, %eax # imm = 0x3333
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $21845, %ecx # imm = 0x5555
+; X86-NEXT: addw %cx, %cx
; X86-NEXT: shrl %eax
; X86-NEXT: andl $21845, %eax # imm = 0x5555
-; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i16:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: rolw $8, %di
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
-; X64-NEXT: shll $4, %eax
+; X64-NEXT: shlw $4, %ax
; X64-NEXT: shrl $4, %edi
; X64-NEXT: andl $3855, %edi # imm = 0xF0F
-; X64-NEXT: orl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andl $13107, %ecx # imm = 0x3333
+; X64-NEXT: shlw $2, %cx
+; X64-NEXT: shrl $2, %eax
; X64-NEXT: andl $13107, %eax # imm = 0x3333
-; X64-NEXT: shrl $2, %edi
-; X64-NEXT: andl $13107, %edi # imm = 0x3333
-; X64-NEXT: leal (%rdi,%rax,4), %eax
+; X64-NEXT: orl %ecx, %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $21845, %ecx # imm = 0x5555
+; X64-NEXT: addw %cx, %cx
; X64-NEXT: shrl %eax
; X64-NEXT: andl $21845, %eax # imm = 0x5555
-; X64-NEXT: leal (%rax,%rcx,2), %eax
+; X64-NEXT: orl %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll
index b73af677bc6cdd..0beb4dd10e7480 100644
--- a/llvm/test/CodeGen/X86/bool-math.ll
+++ b/llvm/test/CodeGen/X86/bool-math.ll
@@ -262,10 +262,12 @@ define i8 @low_bit_select_constants_bigger_true_narrower_result(i16 %x) {
define i1 @opaque_constant(i48 %x, i48 %y) {
; X64-LABEL: opaque_constant:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: shrq $32, %rdi
; X64-NEXT: shrq $32, %rax
-; X64-NEXT: andl $1, %eax
+; X64-NEXT: xorl %edi, %eax
; X64-NEXT: # kill: def $al killed $al killed $rax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/bool-vector.ll b/llvm/test/CodeGen/X86/bool-vector.ll
index d52b455eb2e6b0..c3eaded0595268 100644
--- a/llvm/test/CodeGen/X86/bool-vector.ll
+++ b/llvm/test/CodeGen/X86/bool-vector.ll
@@ -91,18 +91,6 @@ define i32 @PR15215_good(<4 x i32> %input) nounwind {
; X64-NEXT: leal (%rax,%rdx,4), %eax
; X64-NEXT: leal (%rax,%rcx,8), %eax
; X64-NEXT: retq
-;
-; SSE2-LABEL: PR15215_good:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pslld $31, %xmm0
-; SSE2-NEXT: movmskps %xmm0, %eax
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; AVX2-LABEL: PR15215_good:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskps %xmm0, %eax
-; AVX2-NEXT: ret{{[l|q]}}
entry:
%0 = trunc <4 x i32> %input to <4 x i1>
%1 = extractelement <4 x i1> %0, i32 0
diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll
index efd9d1105d975f..acc3385d57a67f 100644
--- a/llvm/test/CodeGen/X86/btc_bts_btr.ll
+++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll
@@ -237,20 +237,20 @@ define i16 @btr_16_mask(i16 %x, i16 %n) {
define i16 @bts_16_mask(i16 %x, i16 %n) {
; X64-LABEL: bts_16_mask:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $15, %sil
-; X64-NEXT: btsl %esi, %eax
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andb $15, %cl
+; X64-NEXT: movw $1, %ax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shlw %cl, %ax
+; X64-NEXT: orl %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: bts_16_mask:
; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: orw {{[0-9]+}}(%esp), %ax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: btsw %cx, %ax
; X86-NEXT: retl
%1 = and i16 %n, 15
%2 = shl i16 1, %1
@@ -261,20 +261,20 @@ define i16 @bts_16_mask(i16 %x, i16 %n) {
define i16 @btc_16_mask(i16 %x, i16 %n) {
; X64-LABEL: btc_16_mask:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $15, %sil
-; X64-NEXT: btcl %esi, %eax
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andb $15, %cl
+; X64-NEXT: movw $1, %ax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shlw %cl, %ax
+; X64-NEXT: xorl %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: btc_16_mask:
; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: btcw %cx, %ax
; X86-NEXT: retl
%1 = and i16 %n, 15
%2 = shl i16 1, %1
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index ae70b6a5a46656..b645f1bc7d0a9c 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -11,6 +11,13 @@ define void @foo(<3 x float> %in, ptr nocapture %out) nounwind {
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzbl %al, %ecx
+; SSE2-NEXT: movzbl %ah, %edx
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: shll $8, %edx
+; SSE2-NEXT: orl %ecx, %edx
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: orl %edx, %eax
; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; SSE2-NEXT: movl %eax, (%rdi)
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll
index 65d74c8f262a31..267fe5b37dafb6 100644
--- a/llvm/test/CodeGen/X86/combine-rotates.ll
+++ b/llvm/test/CodeGen/X86/combine-rotates.ll
@@ -440,7 +440,8 @@ define i5 @rotl_merge_i5(i5 %x) {
; CHECK-NEXT: leal (,%rdi,4), %ecx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: andb $24, %al
-; CHECK-NEXT: shrb $3, %al
+; CHECK-NEXT: shrb %al
+; CHECK-NEXT: shrb $2, %al
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: retq
%r1 = call i5 @llvm.fshl.i5(i5 %x, i5 %x, i5 -1)
diff --git a/llvm/test/CodeGen/X86/combine-sext-in-reg.ll b/llvm/test/CodeGen/X86/combine-sext-in-reg.ll
index 686945a7bcd9e0..e13bebe7e63aef 100644
--- a/llvm/test/CodeGen/X86/combine-sext-in-reg.ll
+++ b/llvm/test/CodeGen/X86/combine-sext-in-reg.ll
@@ -6,15 +6,29 @@
define <4 x i64> @sextinreg_zext_v16i8_4i64(<16 x i8> %a0) {
; SSE-LABEL: sextinreg_zext_v16i8_4i64:
; SSE: # %bb.0:
-; SSE-NEXT: pmovsxbq %xmm0, %xmm2
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmovsxbq %xmm0, %xmm1
+; SSE-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: psllq $56, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pslld $24, %xmm2
+; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sextinreg_zext_v16i8_4i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbq %xmm0, %ymm0
+; AVX-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpsllq $56, %ymm0, %ymm1
+; AVX-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX-NEXT: vpslld $24, %ymm0, %ymm0
+; AVX-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = zext <4 x i8> %1 to <4 x i64>
@@ -27,15 +41,24 @@ define <4 x i64> @sextinreg_zext_v16i8_4i64(<16 x i8> %a0) {
define <4 x i64> @sextinreg_zext_sext_v16i8_4i64(<16 x i8> %a0) {
; SSE-LABEL: sextinreg_zext_sext_v16i8_4i64:
; SSE: # %bb.0:
-; SSE-NEXT: pmovsxbq %xmm0, %xmm2
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmovsxbq %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pmovsxbd %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,1]
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: sextinreg_zext_sext_v16i8_4i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbq %xmm0, %ymm0
+; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm1
+; AVX-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = sext <4 x i8> %1 to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 5472e1e6c0833e..1523a24ac90f00 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -254,26 +254,29 @@ define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $20, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $20, %xmm1
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pslld $20, %xmm1
-; SSE41-NEXT: pslld $20, %xmm0
+; SSE41-NEXT: psllw $4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: pslld $16, %xmm0
+; SSE41-NEXT: pslld $16, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpslld $20, %ymm0, %ymm0
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX-NEXT: vpslld $16, %ymm0, %ymm0
; AVX-NEXT: retq
%1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
%2 = sext <8 x i16> %1 to <8 x i32>
@@ -345,34 +348,32 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: pslld $4, %xmm0
+; SSE2-NEXT: pslld $4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pslld $4, %xmm1
+; SSE41-NEXT: pslld $4, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: combine_vec_shl_zext_lshr0:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: combine_vec_shl_zext_lshr0:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
+; AVX-LABEL: combine_vec_shl_zext_lshr0:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpslld $4, %ymm0, %ymm0
+; AVX-NEXT: retq
%1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
%2 = zext <8 x i16> %1 to <8 x i32>
%3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -382,28 +383,49 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: combine_vec_shl_zext_lshr1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_zext_lshr1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_zext_lshr1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 15>
%2 = zext <8 x i16> %1 to <8 x i32>
%3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15>
@@ -473,12 +495,14 @@ define i32 @combine_shl_ge_sel_ashr_exact0(i32 %x, i32 %y, i32 %z) {
define <4 x i32> @combine_vec_shl_lt_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_exact0:
; SSE: # %bb.0:
-; SSE-NEXT: psrad $2, %xmm0
+; SSE-NEXT: psrad $5, %xmm0
+; SSE-NEXT: pslld $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact0:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX-NEXT: vpslld $3, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
%2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
@@ -557,22 +581,15 @@ define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE: # %bb.0:
-; SSE-NEXT: psrld $2, %xmm0
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: pslld $3, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: combine_vec_shl_le_lshr0:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: combine_vec_shl_le_lshr0:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: combine_vec_shl_le_lshr0:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
%2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %2
@@ -611,19 +628,15 @@ define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE: # %bb.0:
-; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: pslld $5, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: combine_vec_shl_ashr0:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: combine_vec_shl_ashr0:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: combine_vec_shl_ashr0:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
%2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %2
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index 49ce2455ae8c7a..983b5c1a86ed0e 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -59,7 +59,8 @@ define i32 @combine_srem_by_minsigned(i32 %x) {
; CHECK-NEXT: leal 2147483647(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
-; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: shll $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: retq
%1 = srem i32 %x, -2147483648
@@ -73,28 +74,20 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) {
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psrld $31, %xmm1
+; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_vec_srem_by_minsigned:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_srem_by_minsigned:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_vec_srem_by_minsigned:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $31, %xmm1, %xmm1
+; AVX-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = srem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
ret <4 x i32> %1
}
@@ -192,28 +185,20 @@ define <4 x i32> @combine_vec_srem_by_pow2a(<4 x i32> %x) {
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psrld $2, %xmm1
+; SSE-NEXT: pslld $2, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_vec_srem_by_pow2a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $30, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_srem_by_pow2a:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrld $30, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967292,4294967292,4294967292,4294967292]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_vec_srem_by_pow2a:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX-NEXT: vpslld $2, %xmm1, %xmm1
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = srem <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
ret <4 x i32> %1
}
@@ -450,7 +435,8 @@ define i32 @combine_srem_two(i32 %x) {
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: shrl $31, %ecx
; CHECK-NEXT: addl %edi, %ecx
-; CHECK-NEXT: andl $-2, %ecx
+; CHECK-NEXT: sarl %ecx
+; CHECK-NEXT: addl %ecx, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: retq
%1 = srem i32 %x, 2
@@ -464,7 +450,8 @@ define i32 @combine_srem_negtwo(i32 %x) {
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: shrl $31, %ecx
; CHECK-NEXT: addl %edi, %ecx
-; CHECK-NEXT: andl $-2, %ecx
+; CHECK-NEXT: sarl %ecx
+; CHECK-NEXT: addl %ecx, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: retq
%1 = srem i32 %x, -2
@@ -479,7 +466,8 @@ define i8 @combine_i8_srem_negpow2(i8 %x) {
; CHECK-NEXT: sarb $7, %cl
; CHECK-NEXT: shrb $2, %cl
; CHECK-NEXT: addb %al, %cl
-; CHECK-NEXT: andb $-64, %cl
+; CHECK-NEXT: sarb $6, %cl
+; CHECK-NEXT: shlb $6, %cl
; CHECK-NEXT: subb %cl, %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
@@ -494,7 +482,9 @@ define i16 @combine_i16_srem_pow2(i16 %x) {
; CHECK-NEXT: leal 15(%rax), %ecx
; CHECK-NEXT: testw %ax, %ax
; CHECK-NEXT: cmovnsl %edi, %ecx
-; CHECK-NEXT: andl $-16, %ecx
+; CHECK-NEXT: movzwl %cx, %ecx
+; CHECK-NEXT: shrl $4, %ecx
+; CHECK-NEXT: shlw $4, %cx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $rax
; CHECK-NEXT: retq
@@ -509,7 +499,8 @@ define i16 @combine_i16_srem_negpow2(i16 %x) {
; CHECK-NEXT: leal 255(%rax), %ecx
; CHECK-NEXT: testw %ax, %ax
; CHECK-NEXT: cmovnsl %edi, %ecx
-; CHECK-NEXT: andl $-256, %ecx
+; CHECK-NEXT: movzbl %ch, %ecx
+; CHECK-NEXT: shlw $8, %cx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $rax
; CHECK-NEXT: retq
@@ -524,7 +515,8 @@ define i32 @combine_srem_pow2(i32 %x) {
; CHECK-NEXT: leal 15(%rax), %ecx
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %ecx
-; CHECK-NEXT: andl $-16, %ecx
+; CHECK-NEXT: sarl $4, %ecx
+; CHECK-NEXT: shll $4, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
@@ -539,7 +531,8 @@ define i32 @combine_srem_negpow2(i32 %x) {
; CHECK-NEXT: leal 255(%rax), %ecx
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %ecx
-; CHECK-NEXT: andl $-256, %ecx
+; CHECK-NEXT: sarl $8, %ecx
+; CHECK-NEXT: shll $8, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
@@ -554,7 +547,8 @@ define i64 @combine_i64_srem_pow2(i64 %x) {
; CHECK-NEXT: leaq 15(%rdi), %rcx
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rcx
-; CHECK-NEXT: andq $-16, %rcx
+; CHECK-NEXT: sarq $4, %rcx
+; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: subq %rcx, %rax
; CHECK-NEXT: retq
%1 = srem i64 %x, 16
@@ -568,7 +562,8 @@ define i64 @combine_i64_srem_negpow2(i64 %x) {
; CHECK-NEXT: leaq 255(%rdi), %rcx
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rcx
-; CHECK-NEXT: andq $-256, %rcx
+; CHECK-NEXT: sarq $8, %rcx
+; CHECK-NEXT: shlq $8, %rcx
; CHECK-NEXT: subq %rcx, %rax
; CHECK-NEXT: retq
%1 = srem i64 %x, -256
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 33649e6d87b915..1cb9e411483a3f 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -347,33 +347,60 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask0:
; SSE: # %bb.0:
-; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pslld $2, %xmm0
+; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: combine_vec_lshr_shl_mask0:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
-; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: combine_vec_lshr_shl_mask0:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: combine_vec_lshr_shl_mask0:
+; AVX: # %bb.0:
+; AVX-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
%2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
ret <4 x i32> %2
}
define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_lshr_shl_mask1:
-; SSE: # %bb.0:
-; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_vec_lshr_shl_mask1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrld $5, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld $4, %xmm3
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE2-NEXT: psrld $3, %xmm2
+; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[0,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_lshr_shl_mask1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $4, %xmm1
+; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask1:
; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,3,4,5]
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
%2 = lshr <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
index 142ac754c3f7e8..3977a859b24e8d 100644
--- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
@@ -149,12 +149,14 @@ define i8 @test_i8_224_mask_lshr_5(i8 %a0) {
; X86-LABEL: test_i8_224_mask_lshr_5:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andb $-32, %al
; X86-NEXT: shrb $5, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_224_mask_lshr_5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $-32, %al
; X64-NEXT: shrb $5, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -187,14 +189,14 @@ define i8 @test_i8_7_mask_ashr_1(i8 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andb $6, %al
-; X86-NEXT: shrb %al
+; X86-NEXT: sarb %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_7_mask_ashr_1:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $6, %al
-; X64-NEXT: shrb %al
+; X64-NEXT: sarb %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%t0 = and i8 %a0, 7
@@ -207,14 +209,14 @@ define i8 @test_i8_28_mask_ashr_1(i8 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andb $28, %al
-; X86-NEXT: shrb %al
+; X86-NEXT: sarb %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_28_mask_ashr_1:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $28, %al
-; X64-NEXT: shrb %al
+; X64-NEXT: sarb %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%t0 = and i8 %a0, 28
@@ -226,14 +228,14 @@ define i8 @test_i8_28_mask_ashr_2(i8 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andb $28, %al
-; X86-NEXT: shrb $2, %al
+; X86-NEXT: sarb $2, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_28_mask_ashr_2:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $28, %al
-; X64-NEXT: shrb $2, %al
+; X64-NEXT: sarb $2, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%t0 = and i8 %a0, 28
@@ -245,14 +247,14 @@ define i8 @test_i8_28_mask_ashr_3(i8 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andb $24, %al
-; X86-NEXT: shrb $3, %al
+; X86-NEXT: sarb $3, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_28_mask_ashr_3:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $24, %al
-; X64-NEXT: shrb $3, %al
+; X64-NEXT: sarb $3, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%t0 = and i8 %a0, 28
@@ -264,14 +266,14 @@ define i8 @test_i8_28_mask_ashr_4(i8 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andb $16, %al
-; X86-NEXT: shrb $4, %al
+; X86-NEXT: sarb $4, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_28_mask_ashr_4:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $16, %al
-; X64-NEXT: shrb $4, %al
+; X64-NEXT: sarb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%t0 = and i8 %a0, 28
@@ -321,12 +323,14 @@ define i8 @test_i8_224_mask_ashr_5(i8 %a0) {
; X86-LABEL: test_i8_224_mask_ashr_5:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andb $-32, %al
; X86-NEXT: sarb $5, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_224_mask_ashr_5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $-32, %al
; X64-NEXT: sarb $5, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -396,12 +400,14 @@ define i8 @test_i8_7_mask_shl_5(i8 %a0) {
; X86-LABEL: test_i8_7_mask_shl_5:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andb $7, %al
; X86-NEXT: shlb $5, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_7_mask_shl_5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $7, %al
; X64-NEXT: shlb $5, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -535,7 +541,7 @@ define i16 @test_i16_127_mask_lshr_1(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $126, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -543,7 +549,7 @@ define i16 @test_i16_127_mask_lshr_1(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $126, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 127
@@ -556,7 +562,7 @@ define i16 @test_i16_2032_mask_lshr_3(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $2032, %eax # imm = 0x7F0
-; X86-NEXT: shrl $3, %eax
+; X86-NEXT: shrw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -564,7 +570,7 @@ define i16 @test_i16_2032_mask_lshr_3(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $2032, %eax # imm = 0x7F0
-; X64-NEXT: shrl $3, %eax
+; X64-NEXT: shrw $3, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -575,16 +581,16 @@ define i16 @test_i16_2032_mask_lshr_4(i16 %a0) {
; X86-LABEL: test_i16_2032_mask_lshr_4:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $4, %eax
-; X86-NEXT: andl $127, %eax
+; X86-NEXT: andl $2032, %eax # imm = 0x7F0
+; X86-NEXT: shrw $4, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_2032_mask_lshr_4:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $4, %eax
-; X64-NEXT: andl $127, %eax
+; X64-NEXT: andl $2032, %eax # imm = 0x7F0
+; X64-NEXT: shrw $4, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -595,16 +601,16 @@ define i16 @test_i16_2032_mask_lshr_5(i16 %a0) {
; X86-LABEL: test_i16_2032_mask_lshr_5:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $5, %eax
-; X86-NEXT: andl $63, %eax
+; X86-NEXT: andl $2016, %eax # imm = 0x7E0
+; X86-NEXT: shrw $5, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_2032_mask_lshr_5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $5, %eax
-; X64-NEXT: andl $63, %eax
+; X64-NEXT: andl $2016, %eax # imm = 0x7E0
+; X64-NEXT: shrw $5, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -615,16 +621,16 @@ define i16 @test_i16_2032_mask_lshr_6(i16 %a0) {
; X86-LABEL: test_i16_2032_mask_lshr_6:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $6, %eax
-; X86-NEXT: andl $31, %eax
+; X86-NEXT: andl $1984, %eax # imm = 0x7C0
+; X86-NEXT: shrw $6, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_2032_mask_lshr_6:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $6, %eax
-; X64-NEXT: andl $31, %eax
+; X64-NEXT: andl $1984, %eax # imm = 0x7C0
+; X64-NEXT: shrw $6, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -637,7 +643,7 @@ define i16 @test_i16_65024_mask_lshr_1(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $65024, %eax # imm = 0xFE00
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -645,7 +651,7 @@ define i16 @test_i16_65024_mask_lshr_1(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $65024, %eax # imm = 0xFE00
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 65024
@@ -657,7 +663,7 @@ define i16 @test_i16_65024_mask_lshr_8(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $65024, %eax # imm = 0xFE00
-; X86-NEXT: shrl $8, %eax
+; X86-NEXT: movzbl %ah, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -665,7 +671,7 @@ define i16 @test_i16_65024_mask_lshr_8(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $65024, %eax # imm = 0xFE00
-; X64-NEXT: shrl $8, %eax
+; X64-NEXT: movzbl %ah, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 65024
@@ -676,14 +682,16 @@ define i16 @test_i16_65024_mask_lshr_9(i16 %a0) {
; X86-LABEL: test_i16_65024_mask_lshr_9:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $9, %eax
+; X86-NEXT: andl $65024, %eax # imm = 0xFE00
+; X86-NEXT: shrw $9, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_65024_mask_lshr_9:
; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $65024, %eax # imm = 0xFE00
+; X64-NEXT: shrw $9, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 65024
@@ -716,7 +724,7 @@ define i16 @test_i16_127_mask_ashr_1(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $126, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -724,7 +732,7 @@ define i16 @test_i16_127_mask_ashr_1(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $126, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: sarw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 127
@@ -737,7 +745,7 @@ define i16 @test_i16_2032_mask_ashr_3(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $2032, %eax # imm = 0x7F0
-; X86-NEXT: shrl $3, %eax
+; X86-NEXT: sarw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -745,7 +753,7 @@ define i16 @test_i16_2032_mask_ashr_3(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $2032, %eax # imm = 0x7F0
-; X64-NEXT: shrl $3, %eax
+; X64-NEXT: sarw $3, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -756,16 +764,16 @@ define i16 @test_i16_2032_mask_ashr_4(i16 %a0) {
; X86-LABEL: test_i16_2032_mask_ashr_4:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $4, %eax
-; X86-NEXT: andl $127, %eax
+; X86-NEXT: andl $2032, %eax # imm = 0x7F0
+; X86-NEXT: sarw $4, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_2032_mask_ashr_4:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $4, %eax
-; X64-NEXT: andl $127, %eax
+; X64-NEXT: andl $2032, %eax # imm = 0x7F0
+; X64-NEXT: sarw $4, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -776,16 +784,16 @@ define i16 @test_i16_2032_mask_ashr_5(i16 %a0) {
; X86-LABEL: test_i16_2032_mask_ashr_5:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $5, %eax
-; X86-NEXT: andl $63, %eax
+; X86-NEXT: andl $2016, %eax # imm = 0x7E0
+; X86-NEXT: sarw $5, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_2032_mask_ashr_5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $5, %eax
-; X64-NEXT: andl $63, %eax
+; X64-NEXT: andl $2016, %eax # imm = 0x7E0
+; X64-NEXT: sarw $5, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -796,16 +804,16 @@ define i16 @test_i16_2032_mask_ashr_6(i16 %a0) {
; X86-LABEL: test_i16_2032_mask_ashr_6:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $6, %eax
-; X86-NEXT: andl $31, %eax
+; X86-NEXT: andl $1984, %eax # imm = 0x7C0
+; X86-NEXT: sarw $6, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_2032_mask_ashr_6:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $6, %eax
-; X64-NEXT: andl $31, %eax
+; X64-NEXT: andl $1984, %eax # imm = 0x7C0
+; X64-NEXT: sarw $6, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -818,16 +826,15 @@ define i16 @test_i16_65024_mask_ashr_1(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $65024, %eax # imm = 0xFE00
-; X86-NEXT: cwtl
-; X86-NEXT: shrl %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_65024_mask_ashr_1:
; X64: # %bb.0:
-; X64-NEXT: andl $65024, %edi # imm = 0xFE00
-; X64-NEXT: movswl %di, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $65024, %eax # imm = 0xFE00
+; X64-NEXT: sarw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 65024
@@ -839,16 +846,15 @@ define i16 @test_i16_65024_mask_ashr_8(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $65024, %eax # imm = 0xFE00
-; X86-NEXT: cwtl
-; X86-NEXT: shrl $8, %eax
+; X86-NEXT: sarw $8, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_65024_mask_ashr_8:
; X64: # %bb.0:
-; X64-NEXT: andl $65024, %edi # imm = 0xFE00
-; X64-NEXT: movswl %di, %eax
-; X64-NEXT: shrl $8, %eax
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $65024, %eax # imm = 0xFE00
+; X64-NEXT: sarw $8, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 65024
@@ -858,15 +864,17 @@ define i16 @test_i16_65024_mask_ashr_8(i16 %a0) {
define i16 @test_i16_65024_mask_ashr_9(i16 %a0) {
; X86-LABEL: test_i16_65024_mask_ashr_9:
; X86: # %bb.0:
-; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $9, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $65024, %eax # imm = 0xFE00
+; X86-NEXT: sarw $9, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_65024_mask_ashr_9:
; X64: # %bb.0:
-; X64-NEXT: movswl %di, %eax
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $65024, %eax # imm = 0xFE00
+; X64-NEXT: sarw $9, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 65024
@@ -899,7 +907,7 @@ define i16 @test_i16_127_mask_shl_1(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $127, %eax
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -919,7 +927,7 @@ define i16 @test_i16_127_mask_shl_8(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $127, %eax
-; X86-NEXT: shll $8, %eax
+; X86-NEXT: shlw $8, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -927,7 +935,7 @@ define i16 @test_i16_127_mask_shl_8(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $127, %eax
-; X64-NEXT: shll $8, %eax
+; X64-NEXT: shlw $8, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 127
@@ -937,15 +945,17 @@ define i16 @test_i16_127_mask_shl_8(i16 %a0) {
define i16 @test_i16_127_mask_shl_9(i16 %a0) {
; X86-LABEL: test_i16_127_mask_shl_9:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $9, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $127, %eax
+; X86-NEXT: shlw $9, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_127_mask_shl_9:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $9, %eax
+; X64-NEXT: andl $127, %eax
+; X64-NEXT: shlw $9, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 127
@@ -976,7 +986,7 @@ define i16 @test_i16_2032_mask_shl_3(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $2032, %eax # imm = 0x7F0
-; X86-NEXT: shll $3, %eax
+; X86-NEXT: shlw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -996,7 +1006,7 @@ define i16 @test_i16_2032_mask_shl_4(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $2032, %eax # imm = 0x7F0
-; X86-NEXT: shll $4, %eax
+; X86-NEXT: shlw $4, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -1004,7 +1014,7 @@ define i16 @test_i16_2032_mask_shl_4(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $2032, %eax # imm = 0x7F0
-; X64-NEXT: shll $4, %eax
+; X64-NEXT: shlw $4, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -1016,7 +1026,7 @@ define i16 @test_i16_2032_mask_shl_5(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $2032, %eax # imm = 0x7F0
-; X86-NEXT: shll $5, %eax
+; X86-NEXT: shlw $5, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -1024,7 +1034,7 @@ define i16 @test_i16_2032_mask_shl_5(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $2032, %eax # imm = 0x7F0
-; X64-NEXT: shll $5, %eax
+; X64-NEXT: shlw $5, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -1036,7 +1046,7 @@ define i16 @test_i16_2032_mask_shl_6(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $1008, %eax # imm = 0x3F0
-; X86-NEXT: shll $6, %eax
+; X86-NEXT: shlw $6, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -1044,7 +1054,7 @@ define i16 @test_i16_2032_mask_shl_6(i16 %a0) {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $1008, %eax # imm = 0x3F0
-; X64-NEXT: shll $6, %eax
+; X64-NEXT: shlw $6, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = and i16 %a0, 2032
@@ -1057,7 +1067,7 @@ define i16 @test_i16_65024_mask_shl_1(i16 %a0) {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $32256, %eax # imm = 0x7E00
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -1210,13 +1220,15 @@ define i32 @test_i32_4294836224_mask_lshr_16(i32 %a0) {
define i32 @test_i32_4294836224_mask_lshr_17(i32 %a0) {
; X86-LABEL: test_i32_4294836224_mask_lshr_17:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrl $17, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_4294836224_mask_lshr_17:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $-131072, %eax # imm = 0xFFFE0000
; X64-NEXT: shrl $17, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 4294836224
@@ -1247,14 +1259,14 @@ define i32 @test_i32_32767_mask_ashr_1(i32 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl $32766, %eax # imm = 0x7FFE
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: sarl %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_32767_mask_ashr_1:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $32766, %eax # imm = 0x7FFE
-; X64-NEXT: shrl %eax
+; X64-NEXT: sarl %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 32767
%t1 = ashr i32 %t0, 1
@@ -1266,14 +1278,14 @@ define i32 @test_i32_8388352_mask_ashr_7(i32 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl $8388352, %eax # imm = 0x7FFF00
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $7, %eax
+; X86-NEXT: sarl $7, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_8388352_mask_ashr_7:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $8388352, %eax # imm = 0x7FFF00
-; X64-NEXT: shrl $7, %eax
+; X64-NEXT: sarl $7, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 7
@@ -1284,14 +1296,14 @@ define i32 @test_i32_8388352_mask_ashr_8(i32 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl $8388352, %eax # imm = 0x7FFF00
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $8, %eax
+; X86-NEXT: sarl $8, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_8388352_mask_ashr_8:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $8388352, %eax # imm = 0x7FFF00
-; X64-NEXT: shrl $8, %eax
+; X64-NEXT: sarl $8, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 8
@@ -1302,14 +1314,14 @@ define i32 @test_i32_8388352_mask_ashr_9(i32 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl $8388096, %eax # imm = 0x7FFE00
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $9, %eax
+; X86-NEXT: sarl $9, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_8388352_mask_ashr_9:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $8388096, %eax # imm = 0x7FFE00
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: sarl $9, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 9
@@ -1320,14 +1332,14 @@ define i32 @test_i32_8388352_mask_ashr_10(i32 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl $8387584, %eax # imm = 0x7FFC00
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $10, %eax
+; X86-NEXT: sarl $10, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_8388352_mask_ashr_10:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $8387584, %eax # imm = 0x7FFC00
-; X64-NEXT: shrl $10, %eax
+; X64-NEXT: sarl $10, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 8388352
%t1 = ashr i32 %t0, 10
@@ -1373,13 +1385,15 @@ define i32 @test_i32_4294836224_mask_ashr_16(i32 %a0) {
define i32 @test_i32_4294836224_mask_ashr_17(i32 %a0) {
; X86-LABEL: test_i32_4294836224_mask_ashr_17:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $17, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_4294836224_mask_ashr_17:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $-131072, %eax # imm = 0xFFFE0000
; X64-NEXT: sarl $17, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 4294836224
@@ -1444,13 +1458,15 @@ define i32 @test_i32_32767_mask_shl_16(i32 %a0) {
define i32 @test_i32_32767_mask_shl_17(i32 %a0) {
; X86-LABEL: test_i32_32767_mask_shl_17:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $32767, %eax # imm = 0x7FFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $17, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_32767_mask_shl_17:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
; X64-NEXT: shll $17, %eax
; X64-NEXT: retq
%t0 = and i32 %a0, 32767
@@ -1624,9 +1640,9 @@ define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) {
;
; X64-LABEL: test_i64_140737488289792_mask_lshr_16:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $140737488289792, %rax # imm = 0x7FFFFFFF0000
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: shrq $16, %rax
-; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = lshr i64 %t0, 16
@@ -1644,9 +1660,9 @@ define i64 @test_i64_140737488289792_mask_lshr_17(i64 %a0) {
;
; X64-LABEL: test_i64_140737488289792_mask_lshr_17:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $140737488224256, %rax # imm = 0x7FFFFFFE0000
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: shrq $17, %rax
-; X64-NEXT: andl $1073741823, %eax # imm = 0x3FFFFFFF
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = lshr i64 %t0, 17
@@ -1664,9 +1680,9 @@ define i64 @test_i64_140737488289792_mask_lshr_18(i64 %a0) {
;
; X64-LABEL: test_i64_140737488289792_mask_lshr_18:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $140737488093184, %rax # imm = 0x7FFFFFFC0000
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: shrq $18, %rax
-; X64-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = lshr i64 %t0, 18
@@ -1677,6 +1693,7 @@ define i64 @test_i64_18446744065119617024_mask_lshr_1(i64 %a0) {
; X86-LABEL: test_i64_18446744065119617024_mask_lshr_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $-2, %edx
; X86-NEXT: shrl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
@@ -1713,13 +1730,15 @@ define i64 @test_i64_18446744065119617024_mask_lshr_33(i64 %a0) {
; X86-LABEL: test_i64_18446744065119617024_mask_lshr_33:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $-2, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: test_i64_18446744065119617024_mask_lshr_33:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $-8589934592, %rax # imm = 0xFFFFFFFE00000000
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: shrq $33, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 18446744065119617024
@@ -1759,7 +1778,7 @@ define i64 @test_i64_2147483647_mask_ashr_1(i64 %a0) {
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: andl $2147483646, %eax # imm = 0x7FFFFFFE
-; X64-NEXT: shrl %eax
+; X64-NEXT: sarq %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 2147483647
%t1 = ashr i64 %t0, 1
@@ -1780,7 +1799,7 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) {
; X64: # %bb.0:
; X64-NEXT: movabsq $140737488289792, %rax # imm = 0x7FFFFFFF0000
; X64-NEXT: andq %rdi, %rax
-; X64-NEXT: shrq $15, %rax
+; X64-NEXT: sarq $15, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 15
@@ -1798,9 +1817,9 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) {
;
; X64-LABEL: test_i64_140737488289792_mask_ashr_16:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $16, %rax
-; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: movabsq $140737488289792, %rax # imm = 0x7FFFFFFF0000
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: sarq $16, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 16
@@ -1818,9 +1837,9 @@ define i64 @test_i64_140737488289792_mask_ashr_17(i64 %a0) {
;
; X64-LABEL: test_i64_140737488289792_mask_ashr_17:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $17, %rax
-; X64-NEXT: andl $1073741823, %eax # imm = 0x3FFFFFFF
+; X64-NEXT: movabsq $140737488224256, %rax # imm = 0x7FFFFFFE0000
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: sarq $17, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 17
@@ -1838,9 +1857,9 @@ define i64 @test_i64_140737488289792_mask_ashr_18(i64 %a0) {
;
; X64-LABEL: test_i64_140737488289792_mask_ashr_18:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $18, %rax
-; X64-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF
+; X64-NEXT: movabsq $140737488093184, %rax # imm = 0x7FFFFFFC0000
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: sarq $18, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 140737488289792
%t1 = ashr i64 %t0, 18
@@ -1851,6 +1870,7 @@ define i64 @test_i64_18446744065119617024_mask_ashr_1(i64 %a0) {
; X86-LABEL: test_i64_18446744065119617024_mask_ashr_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $-2, %edx
; X86-NEXT: sarl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
@@ -1889,13 +1909,15 @@ define i64 @test_i64_18446744065119617024_mask_ashr_33(i64 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: sarl %eax
+; X86-NEXT: andl $-2, %eax
; X86-NEXT: sarl $31, %edx
+; X86-NEXT: sarl %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i64_18446744065119617024_mask_ashr_33:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $-8589934592, %rax # imm = 0xFFFFFFFE00000000
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: sarq $33, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 18446744065119617024
@@ -1926,14 +1948,16 @@ define i64 @test_i64_18446744065119617024_mask_ashr_34(i64 %a0) {
define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) {
; X86-LABEL: test_i64_2147483647_mask_shl_1:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: test_i64_2147483647_mask_shl_1:
; X64: # %bb.0:
-; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
+; X64-NEXT: leaq (%rdi,%rdi), %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 2147483647
%t1 = shl i64 %t0, 1
@@ -1960,7 +1984,8 @@ define i64 @test_i64_2147483647_mask_shl_32(i64 %a0) {
define i64 @test_i64_2147483647_mask_shl_33(i64 %a0) {
; X86-LABEL: test_i64_2147483647_mask_shl_33:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
; X86-NEXT: addl %edx, %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
@@ -1968,6 +1993,7 @@ define i64 @test_i64_2147483647_mask_shl_33(i64 %a0) {
; X64-LABEL: test_i64_2147483647_mask_shl_33:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
; X64-NEXT: shlq $33, %rax
; X64-NEXT: retq
%t0 = and i64 %a0, 2147483647
diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index a43dba94d30c78..1db43be7fbd18a 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -150,6 +150,8 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone {
; X86-NO-POPCOUNT: # %bb.0:
; X86-NO-POPCOUNT-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NO-POPCOUNT-NEXT: andl $14, %ecx
+; X86-NO-POPCOUNT-NEXT: shrw %cx
+; X86-NO-POPCOUNT-NEXT: addl %ecx, %ecx
; X86-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994
; X86-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-POPCOUNT-NEXT: shrl %cl, %eax
@@ -159,8 +161,10 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone {
;
; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask3:
; X64-NO-POPCOUNT: # %bb.0:
-; X64-NO-POPCOUNT-NEXT: movl %edi, %ecx
-; X64-NO-POPCOUNT-NEXT: andl $14, %ecx
+; X64-NO-POPCOUNT-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NO-POPCOUNT-NEXT: andl $14, %edi
+; X64-NO-POPCOUNT-NEXT: shrw %di
+; X64-NO-POPCOUNT-NEXT: leal (%rdi,%rdi), %ecx
; X64-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994
; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-POPCOUNT-NEXT: shrl %cl, %eax
@@ -229,9 +233,9 @@ define i32 @ctpop_shifted_mask4(i32 %x) nounwind readnone {
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask4:
; X86-NO-POPCOUNT: # %bb.0:
-; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-POPCOUNT-NEXT: movl $7680, %eax # imm = 0x1E00
+; X86-NO-POPCOUNT-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NO-POPCOUNT-NEXT: shrl $9, %eax
-; X86-NO-POPCOUNT-NEXT: andl $15, %eax
; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
; X86-NO-POPCOUNT-NEXT: shrl $3, %eax
; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111
@@ -241,9 +245,10 @@ define i32 @ctpop_shifted_mask4(i32 %x) nounwind readnone {
;
; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask4:
; X64-NO-POPCOUNT: # %bb.0:
-; X64-NO-POPCOUNT-NEXT: movl %edi, %ecx
-; X64-NO-POPCOUNT-NEXT: shrl $7, %ecx
-; X64-NO-POPCOUNT-NEXT: andl $60, %ecx
+; X64-NO-POPCOUNT-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NO-POPCOUNT-NEXT: andl $7680, %edi # imm = 0x1E00
+; X64-NO-POPCOUNT-NEXT: shrl $9, %edi
+; X64-NO-POPCOUNT-NEXT: leal (,%rdi,4), %ecx
; X64-NO-POPCOUNT-NEXT: movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-POPCOUNT-NEXT: shrq %cl, %rax
@@ -312,9 +317,9 @@ define i32 @ctpop_shifted_mask5(i32 %x) nounwind readnone {
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask5:
; X86-NO-POPCOUNT: # %bb.0:
-; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-POPCOUNT-NEXT: movl $11776, %eax # imm = 0x2E00
+; X86-NO-POPCOUNT-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NO-POPCOUNT-NEXT: shrl $9, %eax
-; X86-NO-POPCOUNT-NEXT: andl $23, %eax
; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
; X86-NO-POPCOUNT-NEXT: shrl $3, %eax
; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
@@ -324,8 +329,8 @@ define i32 @ctpop_shifted_mask5(i32 %x) nounwind readnone {
;
; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask5:
; X64-NO-POPCOUNT: # %bb.0:
+; X64-NO-POPCOUNT-NEXT: andl $11776, %edi # imm = 0x2E00
; X64-NO-POPCOUNT-NEXT: shrl $9, %edi
-; X64-NO-POPCOUNT-NEXT: andl $23, %edi
; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201
; X64-NO-POPCOUNT-NEXT: shrl $3, %eax
; X64-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
@@ -393,9 +398,9 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
; X86-NO-POPCOUNT: # %bb.0:
-; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-POPCOUNT-NEXT: movl $26112, %eax # imm = 0x6600
+; X86-NO-POPCOUNT-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NO-POPCOUNT-NEXT: shrl $9, %eax
-; X86-NO-POPCOUNT-NEXT: andl $51, %eax
; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
; X86-NO-POPCOUNT-NEXT: shrl $3, %eax
; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
@@ -406,8 +411,8 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
;
; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
; X64-NO-POPCOUNT: # %bb.0:
+; X64-NO-POPCOUNT-NEXT: andl $26112, %edi # imm = 0x6600
; X64-NO-POPCOUNT-NEXT: shrl $9, %edi
-; X64-NO-POPCOUNT-NEXT: andl $51, %edi
; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201
; X64-NO-POPCOUNT-NEXT: shrl $3, %eax
; X64-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
@@ -478,9 +483,9 @@ define i32 @ctpop_shift_mask7(i32 %x) nounwind readnone {
;
; X86-NO-POPCOUNT-LABEL: ctpop_shift_mask7:
; X86-NO-POPCOUNT: # %bb.0:
-; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-POPCOUNT-NEXT: movl $1040384, %eax # imm = 0xFE000
+; X86-NO-POPCOUNT-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NO-POPCOUNT-NEXT: shrl $13, %eax
-; X86-NO-POPCOUNT-NEXT: andl $127, %eax
; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
; X86-NO-POPCOUNT-NEXT: shrl $3, %eax
; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
@@ -490,8 +495,8 @@ define i32 @ctpop_shift_mask7(i32 %x) nounwind readnone {
;
; X64-NO-POPCOUNT-LABEL: ctpop_shift_mask7:
; X64-NO-POPCOUNT: # %bb.0:
+; X64-NO-POPCOUNT-NEXT: andl $1040384, %edi # imm = 0xFE000
; X64-NO-POPCOUNT-NEXT: shrl $13, %edi
-; X64-NO-POPCOUNT-NEXT: andl $127, %edi
; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201
; X64-NO-POPCOUNT-NEXT: shrl $3, %eax
; X64-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
index 1380c02663ee0e..f7d16f4b7b9653 100644
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -180,10 +180,9 @@ define i32 @urem_constant_sel_constants(i1 %cond) {
define i32 @sel_constants_shl_constant(i1 %cond) {
; CHECK-LABEL: sel_constants_shl_constant:
; CHECK: # %bb.0:
-; CHECK-NEXT: notb %dil
-; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: orl $2, %eax
+; CHECK-NEXT: xorl $3, %eax
; CHECK-NEXT: shll $8, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 2, i32 3
@@ -194,10 +193,12 @@ define i32 @sel_constants_shl_constant(i1 %cond) {
define i32 @shl_constant_sel_constants(i1 %cond) {
; CHECK-LABEL: shl_constant_sel_constants:
; CHECK: # %bb.0:
-; CHECK-NEXT: notb %dil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: leal 4(,%rax,4), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorb $3, %cl
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shll %cl, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 2, i32 3
%bo = shl i32 1, %sel
@@ -207,10 +208,12 @@ define i32 @shl_constant_sel_constants(i1 %cond) {
define i32 @shl_constant_sel_setcc(i32 %a) {
; CHECK-LABEL: shl_constant_sel_setcc:
; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: leal 4(,%rax,4), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorb $3, %cl
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shll %cl, %eax
; CHECK-NEXT: retq
%m = and i32 %a, 1
%cond = icmp ne i32 %m, 0
@@ -222,9 +225,12 @@ define i32 @shl_constant_sel_setcc(i32 %a) {
define i32 @lshr_constant_sel_constants(i1 %cond) {
; CHECK-LABEL: lshr_constant_sel_constants:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: leal 8(,%rdi,8), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorb $3, %cl
+; CHECK-NEXT: movl $64, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shrl %cl, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 2, i32 3
%bo = lshr i32 64, %sel
@@ -234,9 +240,12 @@ define i32 @lshr_constant_sel_constants(i1 %cond) {
define i32 @lshr_constant_sel_setcc(i32 %a) {
; CHECK-LABEL: lshr_constant_sel_setcc:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: leal 8(,%rdi,8), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorb $3, %cl
+; CHECK-NEXT: movl $64, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shrl %cl, %eax
; CHECK-NEXT: retq
%m = and i32 %a, 1
%cond = icmp ne i32 %m, 0
@@ -248,10 +257,12 @@ define i32 @lshr_constant_sel_setcc(i32 %a) {
define i32 @ashr_constant_sel_constants(i1 %cond) {
; CHECK-LABEL: ashr_constant_sel_constants:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: shll $4, %edi
-; CHECK-NEXT: leal 16(%rdi), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorb $3, %cl
+; CHECK-NEXT: movl $128, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: sarl %cl, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 2, i32 3
%bo = ashr i32 128, %sel
@@ -261,10 +272,12 @@ define i32 @ashr_constant_sel_constants(i1 %cond) {
define i32 @ashr_constant_sel_setcc(i32 %a) {
; CHECK-LABEL: ashr_constant_sel_setcc:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: shll $4, %edi
-; CHECK-NEXT: leal 16(%rdi), %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorb $3, %cl
+; CHECK-NEXT: movl $128, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: sarl %cl, %eax
; CHECK-NEXT: retq
%m = and i32 %a, 1
%cond = icmp ne i32 %m, 0
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index 734abfe55a4ec4..f8de4a6100b493 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -15,14 +15,17 @@ define i16 @fun1(i8 zeroext %v) {
; X86-LABEL: fun1:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: shrb $4, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: shlw $4, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: fun1:
; X64: # %bb.0: # %entry
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-16, %eax
+; X64-NEXT: shrb $4, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: shlw $4, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -36,13 +39,16 @@ define i32 @fun2(i8 zeroext %v) {
; X86-LABEL: fun2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: shrb $4, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: retl
;
; X64-LABEL: fun2:
; X64: # %bb.0: # %entry
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-16, %eax
+; X64-NEXT: shrb $4, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: shll $4, %eax
; X64-NEXT: retq
entry:
%shr = lshr i8 %v, 4
@@ -55,13 +61,15 @@ define i32 @fun3(i16 zeroext %v) {
; X86-LABEL: fun3:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: retl
;
; X64-LABEL: fun3:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-16, %eax
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: shll $4, %eax
; X64-NEXT: retq
entry:
%shr = lshr i16 %v, 4
@@ -74,14 +82,17 @@ define i64 @fun4(i8 zeroext %v) {
; X86-LABEL: fun4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: shrb $4, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: fun4:
; X64: # %bb.0: # %entry
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-16, %eax
+; X64-NEXT: shrb $4, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: shlq $4, %rax
; X64-NEXT: retq
entry:
%shr = lshr i8 %v, 4
@@ -94,14 +105,16 @@ define i64 @fun5(i16 zeroext %v) {
; X86-LABEL: fun5:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: fun5:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-16, %eax
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: shlq $4, %rax
; X64-NEXT: retq
entry:
%shr = lshr i16 %v, 4
@@ -114,14 +127,16 @@ define i64 @fun6(i32 zeroext %v) {
; X86-LABEL: fun6:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: fun6:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-16, %eax
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: shlq $4, %rax
; X64-NEXT: retq
entry:
%shr = lshr i32 %v, 4
@@ -146,7 +161,7 @@ define i64 @fun7(i8 zeroext %v) {
; X64: # %bb.0: # %entry
; X64-NEXT: sarb $4, %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shll $4, %eax
+; X64-NEXT: shlq $4, %rax
; X64-NEXT: retq
entry:
%shr = ashr i8 %v, 4
@@ -159,14 +174,18 @@ define i64 @fun8(i16 zeroext %v) {
; X86-LABEL: fun8:
; X86: # %bb.0: # %entry
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $1048560, %eax # imm = 0xFFFF0
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: fun8:
; X64: # %bb.0: # %entry
; X64-NEXT: movswl %di, %eax
-; X64-NEXT: andl $1048560, %eax # imm = 0xFFFF0
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: shlq $4, %rax
; X64-NEXT: retq
entry:
%shr = ashr i16 %v, 4
@@ -178,10 +197,10 @@ entry:
define i64 @fun9(i32 zeroext %v) {
; X86-LABEL: fun9:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: sarl $4, %edx
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: shrl $28, %edx
; X86-NEXT: retl
;
@@ -215,12 +234,11 @@ define i64 @fun10(i8 zeroext %v) {
;
; X64-LABEL: fun10:
; X64: # %bb.0: # %entry
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shrb $4, %dil
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrb $4, %al
+; X64-NEXT: shlb $4, %al
+; X64-NEXT: orb %dil, %al
; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: andl $-16, %edi
-; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
entry:
%shr = lshr i8 %v, 4
@@ -233,10 +251,10 @@ entry:
define i64 @fun11(i16 zeroext %v) {
; X86-LABEL: fun11:
; X86: # %bb.0: # %entry
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shrl $4, %ecx
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
@@ -244,9 +262,10 @@ define i64 @fun11(i16 zeroext %v) {
; X64-LABEL: fun11:
; X64: # %bb.0: # %entry
; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shrl $4, %edi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $4, %eax
-; X64-NEXT: andl $-16, %edi
+; X64-NEXT: shlw $4, %ax
+; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -260,10 +279,10 @@ entry:
define i64 @fun12(i32 zeroext %v) {
; X86-LABEL: fun12:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shrl $4, %ecx
-; X86-NEXT: andl $-16, %eax
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shll $4, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: setb %dl
@@ -301,7 +320,8 @@ define void @g(i32 %a) nounwind {
; X86: # %bb.0:
; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-4, %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: subl $8, %esp
; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
@@ -312,7 +332,8 @@ define void @g(i32 %a) nounwind {
; X64-LABEL: g:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $-4, %edi
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: shlq $2, %rdi
; X64-NEXT: jmp f # TAILCALL
%b = lshr i32 %a, 2
%c = zext i32 %b to i64
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index ac78136b9d8eac..ed87c0e65a9904 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -553,11 +553,11 @@ define i64 @urem_i64_17(i64 %x) nounwind {
; X86-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-16, %eax
; X86-NEXT: shrl $4, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
@@ -567,11 +567,11 @@ define i64 @urem_i64_17(i64 %x) nounwind {
; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-16, %rax
; X64-NEXT: shrq $4, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shlq $4, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -626,11 +626,11 @@ define i64 @urem_i64_257(i64 %x) nounwind {
; X86-NEXT: movl $-16711935, %edx # imm = 0xFF00FF01
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-256, %eax
; X86-NEXT: shrl $8, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
@@ -640,11 +640,11 @@ define i64 @urem_i64_257(i64 %x) nounwind {
; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-256, %rax
; X64-NEXT: shrq $8, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shlq $8, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -712,11 +712,11 @@ define i64 @urem_i64_65537(i64 %x) nounwind {
; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
; X64-NEXT: shrq $16, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shlq $16, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -752,8 +752,8 @@ define i64 @urem_i64_12(i64 %x) nounwind {
; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
-; X64-NEXT: andq $-4, %rdx
+; X64-NEXT: shrq $3, %rdx
+; X64-NEXT: shlq $2, %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -906,11 +906,11 @@ define i64 @udiv_i64_17(i64 %x) nounwind {
; X86-NEXT: movl $-252645135, %ebx # imm = 0xF0F0F0F1
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-16, %eax
; X86-NEXT: shrl $4, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
; X86-NEXT: movl %ecx, %eax
@@ -994,11 +994,11 @@ define i64 @udiv_i64_257(i64 %x) nounwind {
; X86-NEXT: movl $-16711935, %ebx # imm = 0xFF00FF01
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-256, %eax
; X86-NEXT: shrl $8, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
; X86-NEXT: movl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9d..1ba39ecfb0ec8f 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -239,11 +239,11 @@ define i128 @urem_i128_17(i128 %x) nounwind {
; X86-64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-16, %rax
; X86-64-NEXT: shrq $4, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: shlq $4, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -255,11 +255,11 @@ define i128 @urem_i128_17(i128 %x) nounwind {
; WIN64-NEXT: movabsq $-1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F1
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-16, %rax
; WIN64-NEXT: shrq $4, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shlq $4, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -316,11 +316,11 @@ define i128 @urem_i128_257(i128 %x) nounwind {
; X86-64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-256, %rax
; X86-64-NEXT: shrq $8, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: shlq $8, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -332,11 +332,11 @@ define i128 @urem_i128_257(i128 %x) nounwind {
; WIN64-NEXT: movabsq $-71777214294589695, %rdx # imm = 0xFF00FF00FF00FF01
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-256, %rax
; WIN64-NEXT: shrq $8, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shlq $8, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -393,11 +393,11 @@ define i128 @urem_i128_65537(i128 %x) nounwind {
; X86-64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
; X86-64-NEXT: shrq $16, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: shlq $16, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -409,11 +409,11 @@ define i128 @urem_i128_65537(i128 %x) nounwind {
; WIN64-NEXT: movabsq $-281470681808895, %rdx # imm = 0xFFFF0000FFFF0001
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
; WIN64-NEXT: shrq $16, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shlq $16, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -626,11 +626,11 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; X86-64-NEXT: movabsq $-1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F1
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-16, %rax
; X86-64-NEXT: shrq $4, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: shlq $4, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
@@ -651,11 +651,11 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1
; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-16, %rax
; WIN64-NEXT: shrq $4, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shlq $4, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-1085102592571150096, %r9 # imm = 0xF0F0F0F0F0F0F0F0
@@ -738,11 +738,11 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; X86-64-NEXT: movabsq $-71777214294589695, %r8 # imm = 0xFF00FF00FF00FF01
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-256, %rax
; X86-64-NEXT: shrq $8, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: shlq $8, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00
@@ -763,11 +763,11 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01
; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-256, %rax
; WIN64-NEXT: shrq $8, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shlq $8, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-71777214294589696, %r9 # imm = 0xFF00FF00FF00FF00
@@ -850,11 +850,11 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; X86-64-NEXT: movabsq $-281470681808895, %r8 # imm = 0xFFFF0000FFFF0001
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
; X86-64-NEXT: shrq $16, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: shlq $16, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000
@@ -875,11 +875,11 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001
; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
; WIN64-NEXT: shrq $16, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shlq $16, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-281470681808896, %r9 # imm = 0xFFFF0000FFFF0000
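
All of the urem/udiv hunks above are the same rewrite. The divide-by-constant expansion computes the remainder as x - q*(2^k+1), where q is derived from the high half t of a multiply by a magic constant; q*(2^k+1) used to be built as (t & -2^k) + (t >> k), and with shift flags attached during the visit the mask is canonicalized into a shift pair, giving q + (q << k) with q = t >> k reused. The instruction count is unchanged either way. A reduced i64 analogue of the source pattern (hypothetical; the tests above use i128):

    define i64 @urem_257(i64 %x) {
      %r = urem i64 %x, 257
      ret i64 %r
    }
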
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90e075bfabf0a2..ceef7960626c6e 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -8066,9 +8066,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOBMI-NEXT: movl (%ecx), %ecx
-; X86-NOBMI-NEXT: shrl $19, %ecx
-; X86-NOBMI-NEXT: andl $4092, %ecx # imm = 0xFFC
-; X86-NOBMI-NEXT: incl (%eax,%ecx)
+; X86-NOBMI-NEXT: shrl $21, %ecx
+; X86-NOBMI-NEXT: andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT: incl (%eax,%ecx,4)
; X86-NOBMI-NEXT: retl
;
; X86-BMINOTBM-LABEL: pr38938:
@@ -8091,9 +8091,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
; X64-NOBMI-LABEL: pr38938:
; X64-NOBMI: # %bb.0:
; X64-NOBMI-NEXT: movl (%rsi), %eax
-; X64-NOBMI-NEXT: shrl $19, %eax
-; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC
-; X64-NOBMI-NEXT: incl (%rdi,%rax)
+; X64-NOBMI-NEXT: shrl $21, %eax
+; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF
+; X64-NOBMI-NEXT: incl (%rdi,%rax,4)
; X64-NOBMI-NEXT: retq
;
; X64-BMINOTBM-LABEL: pr38938:
@@ -8182,19 +8182,47 @@ define i32 @c1_i32(i32 %arg) nounwind {
; Should be still fine, but the result is shifted left afterwards
define i32 @c2_i32(i32 %arg) nounwind {
-; X86-LABEL: c2_i32:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $17, %eax
-; X86-NEXT: andl $4092, %eax # imm = 0xFFC
-; X86-NEXT: retl
+; X86-NOBMI-LABEL: c2_i32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: shrl $19, %eax
+; X86-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF
+; X86-NOBMI-NEXT: shll $2, %eax
+; X86-NOBMI-NEXT: retl
;
-; X64-LABEL: c2_i32:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $17, %eax
-; X64-NEXT: andl $4092, %eax # imm = 0xFFC
-; X64-NEXT: retq
+; X86-BMINOTBM-LABEL: c2_i32:
+; X86-BMINOTBM: # %bb.0:
+; X86-BMINOTBM-NEXT: movl $2579, %eax # imm = 0xA13
+; X86-BMINOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT: shll $2, %eax
+; X86-BMINOTBM-NEXT: retl
+;
+; X86-BMITBM-LABEL: c2_i32:
+; X86-BMITBM: # %bb.0:
+; X86-BMITBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %eax # imm = 0xA13
+; X86-BMITBM-NEXT: shll $2, %eax
+; X86-BMITBM-NEXT: retl
+;
+; X64-NOBMI-LABEL: c2_i32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NOBMI-NEXT: shrl $19, %edi
+; X64-NOBMI-NEXT: andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT: leal (,%rdi,4), %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMINOTBM-LABEL: c2_i32:
+; X64-BMINOTBM: # %bb.0:
+; X64-BMINOTBM-NEXT: movl $2579, %eax # imm = 0xA13
+; X64-BMINOTBM-NEXT: bextrl %eax, %edi, %eax
+; X64-BMINOTBM-NEXT: shll $2, %eax
+; X64-BMINOTBM-NEXT: retq
+;
+; X64-BMITBM-LABEL: c2_i32:
+; X64-BMITBM: # %bb.0:
+; X64-BMITBM-NEXT: bextrl $2579, %edi, %eax # imm = 0xA13
+; X64-BMITBM-NEXT: shll $2, %eax
+; X64-BMITBM-NEXT: retq
%tmp0 = lshr i32 %arg, 19
%tmp1 = and i32 %tmp0, 1023
%tmp2 = shl i32 %tmp1, 2
@@ -8291,20 +8319,49 @@ define i64 @c1_i64(i64 %arg) nounwind {
; Should be still fine, but the result is shifted left afterwards
define i64 @c2_i64(i64 %arg) nounwind {
-; X86-LABEL: c2_i64:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $17, %eax
-; X86-NEXT: andl $4092, %eax # imm = 0xFFC
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: retl
+; X86-NOBMI-LABEL: c2_i64:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: shrl $19, %eax
+; X86-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF
+; X86-NOBMI-NEXT: shll $2, %eax
+; X86-NOBMI-NEXT: xorl %edx, %edx
+; X86-NOBMI-NEXT: retl
;
-; X64-LABEL: c2_i64:
-; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $49, %rax
-; X64-NEXT: andl $4092, %eax # imm = 0xFFC
-; X64-NEXT: retq
+; X86-BMINOTBM-LABEL: c2_i64:
+; X86-BMINOTBM: # %bb.0:
+; X86-BMINOTBM-NEXT: movl $2579, %eax # imm = 0xA13
+; X86-BMINOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT: shll $2, %eax
+; X86-BMINOTBM-NEXT: xorl %edx, %edx
+; X86-BMINOTBM-NEXT: retl
+;
+; X86-BMITBM-LABEL: c2_i64:
+; X86-BMITBM: # %bb.0:
+; X86-BMITBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %eax # imm = 0xA13
+; X86-BMITBM-NEXT: shll $2, %eax
+; X86-BMITBM-NEXT: xorl %edx, %edx
+; X86-BMITBM-NEXT: retl
+;
+; X64-NOBMI-LABEL: c2_i64:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: shrq $51, %rdi
+; X64-NOBMI-NEXT: andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT: leaq (,%rdi,4), %rax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMINOTBM-LABEL: c2_i64:
+; X64-BMINOTBM: # %bb.0:
+; X64-BMINOTBM-NEXT: movl $2611, %eax # imm = 0xA33
+; X64-BMINOTBM-NEXT: bextrq %rax, %rdi, %rax
+; X64-BMINOTBM-NEXT: shlq $2, %rax
+; X64-BMINOTBM-NEXT: retq
+;
+; X64-BMITBM-LABEL: c2_i64:
+; X64-BMITBM: # %bb.0:
+; X64-BMITBM-NEXT: bextrq $2611, %rdi, %rax # imm = 0xA33
+; X64-BMITBM-NEXT: shlq $2, %rax
+; X64-BMITBM-NEXT: retq
%tmp0 = lshr i64 %arg, 51
%tmp1 = and i64 %tmp0, 1023
%tmp2 = shl i64 %tmp1, 2
@@ -8442,21 +8499,55 @@ define void @c6_i32(i32 %arg, ptr %ptr) nounwind {
; Should be still fine, but the result is shifted left afterwards
define void @c7_i32(i32 %arg, ptr %ptr) nounwind {
-; X86-LABEL: c7_i32:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $17, %ecx
-; X86-NEXT: andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: retl
+; X86-NOBMI-LABEL: c7_i32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: shrl $19, %ecx
+; X86-NOBMI-NEXT: andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT: shll $2, %ecx
+; X86-NOBMI-NEXT: movl %ecx, (%eax)
+; X86-NOBMI-NEXT: retl
;
-; X64-LABEL: c7_i32:
-; X64: # %bb.0:
-; X64-NEXT: shrl $17, %edi
-; X64-NEXT: andl $4092, %edi # imm = 0xFFC
-; X64-NEXT: movl %edi, (%rsi)
-; X64-NEXT: retq
+; X86-BMINOTBM-LABEL: c7_i32:
+; X86-BMINOTBM: # %bb.0:
+; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT: movl $2579, %ecx # imm = 0xA13
+; X86-BMINOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT: shll $2, %ecx
+; X86-BMINOTBM-NEXT: movl %ecx, (%eax)
+; X86-BMINOTBM-NEXT: retl
+;
+; X86-BMITBM-LABEL: c7_i32:
+; X86-BMITBM: # %bb.0:
+; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMITBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
+; X86-BMITBM-NEXT: shll $2, %ecx
+; X86-BMITBM-NEXT: movl %ecx, (%eax)
+; X86-BMITBM-NEXT: retl
+;
+; X64-NOBMI-LABEL: c7_i32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: shrl $19, %edi
+; X64-NOBMI-NEXT: andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT: shll $2, %edi
+; X64-NOBMI-NEXT: movl %edi, (%rsi)
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMINOTBM-LABEL: c7_i32:
+; X64-BMINOTBM: # %bb.0:
+; X64-BMINOTBM-NEXT: movl $2579, %eax # imm = 0xA13
+; X64-BMINOTBM-NEXT: bextrl %eax, %edi, %eax
+; X64-BMINOTBM-NEXT: shll $2, %eax
+; X64-BMINOTBM-NEXT: movl %eax, (%rsi)
+; X64-BMINOTBM-NEXT: retq
+;
+; X64-BMITBM-LABEL: c7_i32:
+; X64-BMITBM: # %bb.0:
+; X64-BMITBM-NEXT: bextrl $2579, %edi, %eax # imm = 0xA13
+; X64-BMITBM-NEXT: shll $2, %eax
+; X64-BMITBM-NEXT: movl %eax, (%rsi)
+; X64-BMITBM-NEXT: retq
%tmp0 = lshr i32 %arg, 19
%tmp1 = and i32 %tmp0, 1023
%tmp2 = shl i32 %tmp1, 2
@@ -8576,22 +8667,58 @@ define void @c6_i64(i64 %arg, ptr %ptr) nounwind {
; Should be still fine, but the result is shifted left afterwards
define void @c7_i64(i64 %arg, ptr %ptr) nounwind {
-; X86-LABEL: c7_i64:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $17, %ecx
-; X86-NEXT: andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl $0, 4(%eax)
-; X86-NEXT: retl
+; X86-NOBMI-LABEL: c7_i64:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: shrl $19, %ecx
+; X86-NOBMI-NEXT: andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT: shll $2, %ecx
+; X86-NOBMI-NEXT: movl %ecx, (%eax)
+; X86-NOBMI-NEXT: movl $0, 4(%eax)
+; X86-NOBMI-NEXT: retl
;
-; X64-LABEL: c7_i64:
-; X64: # %bb.0:
-; X64-NEXT: shrq $49, %rdi
-; X64-NEXT: andl $4092, %edi # imm = 0xFFC
-; X64-NEXT: movq %rdi, (%rsi)
-; X64-NEXT: retq
+; X86-BMINOTBM-LABEL: c7_i64:
+; X86-BMINOTBM: # %bb.0:
+; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT: movl $2579, %ecx # imm = 0xA13
+; X86-BMINOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT: shll $2, %ecx
+; X86-BMINOTBM-NEXT: movl %ecx, (%eax)
+; X86-BMINOTBM-NEXT: movl $0, 4(%eax)
+; X86-BMINOTBM-NEXT: retl
+;
+; X86-BMITBM-LABEL: c7_i64:
+; X86-BMITBM: # %bb.0:
+; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMITBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
+; X86-BMITBM-NEXT: shll $2, %ecx
+; X86-BMITBM-NEXT: movl %ecx, (%eax)
+; X86-BMITBM-NEXT: movl $0, 4(%eax)
+; X86-BMITBM-NEXT: retl
+;
+; X64-NOBMI-LABEL: c7_i64:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: shrq $51, %rdi
+; X64-NOBMI-NEXT: andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT: shlq $2, %rdi
+; X64-NOBMI-NEXT: movq %rdi, (%rsi)
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMINOTBM-LABEL: c7_i64:
+; X64-BMINOTBM: # %bb.0:
+; X64-BMINOTBM-NEXT: movl $2611, %eax # imm = 0xA33
+; X64-BMINOTBM-NEXT: bextrq %rax, %rdi, %rax
+; X64-BMINOTBM-NEXT: shlq $2, %rax
+; X64-BMINOTBM-NEXT: movq %rax, (%rsi)
+; X64-BMINOTBM-NEXT: retq
+;
+; X64-BMITBM-LABEL: c7_i64:
+; X64-BMITBM: # %bb.0:
+; X64-BMITBM-NEXT: bextrq $2611, %rdi, %rax # imm = 0xA33
+; X64-BMITBM-NEXT: shlq $2, %rax
+; X64-BMITBM-NEXT: movq %rax, (%rsi)
+; X64-BMITBM-NEXT: retq
%tmp0 = lshr i64 %arg, 51
%tmp1 = and i64 %tmp0, 1023
%tmp2 = shl i64 %tmp1, 2
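
The interesting hunks in extract-bits.ll are the c2/c7 variants: the source is lshr + and + shl, which the combiner previously collapsed into a single wider-masked shift ((x >> 17) & 4092 in the i32 case). With flags set on the shl, the bextr-shaped (x >> 19) & 1023 is kept intact and the final shl (or a scaled lea) is emitted separately; that costs one extra instruction on the NOBMI paths but lets the BMI paths use bextr. The i32 source pattern, from the test:

    define i32 @c2_i32(i32 %arg) {
      %tmp0 = lshr i32 %arg, 19
      %tmp1 = and i32 %tmp0, 1023
      %tmp2 = shl i32 %tmp1, 2
      ret i32 %tmp2
    }
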
diff --git a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll
index b9721d24910549..6d3a303da14c20 100644
--- a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll
+++ b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll
@@ -63,7 +63,9 @@ define i64 @test4(i64 %f12) nounwind {
;
; x86_64-LABEL: test4:
; x86_64: # %bb.0:
-; x86_64-NEXT: movslq %edi, %rax
+; x86_64-NEXT: movq %rdi, %rax
+; x86_64-NEXT: shlq $32, %rax
+; x86_64-NEXT: sarq $32, %rax
; x86_64-NEXT: retq
%f11 = shl i64 %f12, 32
%tmp7.25 = ashr i64 %f11, 32
@@ -93,13 +95,17 @@ define i16 @test5(i16 %f12) nounwind {
define i16 @test6(i16 %f12) nounwind {
; i686-LABEL: test6:
; i686: # %bb.0:
-; i686-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: shll $8, %eax
+; i686-NEXT: sarw $8, %ax
; i686-NEXT: # kill: def $ax killed $ax killed $eax
; i686-NEXT: retl
;
; x86_64-LABEL: test6:
; x86_64: # %bb.0:
-; x86_64-NEXT: movsbl %dil, %eax
+; x86_64-NEXT: movl %edi, %eax
+; x86_64-NEXT: shll $8, %eax
+; x86_64-NEXT: sarw $8, %ax
; x86_64-NEXT: # kill: def $ax killed $ax killed $eax
; x86_64-NEXT: retq
%f11 = shl i16 %f12, 8
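
This file shows what looks like a plain regression: shl N followed by ashr N in the same width is a sign-extend-in-register, and the single movslq/movsbl instructions have been replaced by explicit shift pairs (movq + shlq $32 + sarq $32 for test4). The test4 source, from the context lines above:

    define i64 @test4(i64 %f12) {
      %f11 = shl i64 %f12, 32
      %tmp7.25 = ashr i64 %f11, 32
      ret i64 %tmp7.25
    }
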
diff --git a/llvm/test/CodeGen/X86/flt-rounds.ll b/llvm/test/CodeGen/X86/flt-rounds.ll
index a5908978a5438a..2ea4bb55a5bed7 100644
--- a/llvm/test/CodeGen/X86/flt-rounds.ll
+++ b/llvm/test/CodeGen/X86/flt-rounds.ll
@@ -11,8 +11,8 @@ define i32 @test_flt_rounds() nounwind {
; X86-NEXT: subl $2, %esp
; X86-NEXT: fnstcw (%esp)
; X86-NEXT: movzwl (%esp), %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: andb $6, %cl
+; X86-NEXT: andl $3072, %ecx # imm = 0xC00
+; X86-NEXT: shrw $9, %cx
; X86-NEXT: movl $45, %eax
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shrl %cl, %eax
@@ -24,8 +24,8 @@ define i32 @test_flt_rounds() nounwind {
; X64: # %bb.0:
; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: andb $6, %cl
+; X64-NEXT: andl $3072, %ecx # imm = 0xC00
+; X64-NEXT: shrw $9, %cx
; X64-NEXT: movl $45, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %eax
@@ -46,8 +46,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X86-NEXT: calll fesetround
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: andb $6, %cl
+; X86-NEXT: andl $3072, %ecx # imm = 0xC00
+; X86-NEXT: shrw $9, %cx
; X86-NEXT: movl $45, %esi
; X86-NEXT: movl $45, %eax
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -60,8 +60,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X86-NEXT: calll fesetround
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: andb $6, %cl
+; X86-NEXT: andl $3072, %ecx # imm = 0xC00
+; X86-NEXT: shrw $9, %cx
; X86-NEXT: movl $45, %eax
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shrl %cl, %eax
@@ -75,8 +75,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X86-NEXT: calll fesetround
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: andb $6, %cl
+; X86-NEXT: andl $3072, %ecx # imm = 0xC00
+; X86-NEXT: shrw $9, %cx
; X86-NEXT: movl $45, %eax
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shrl %cl, %eax
@@ -87,8 +87,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X86-NEXT: calll fesetround
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: andb $6, %cl
+; X86-NEXT: andl $3072, %ecx # imm = 0xC00
+; X86-NEXT: shrw $9, %cx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: andl $3, %esi
@@ -114,8 +114,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: callq fesetround
; X64-NEXT: fnstcw {{[0-9]+}}(%rsp)
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: andb $6, %cl
+; X64-NEXT: andl $3072, %ecx # imm = 0xC00
+; X64-NEXT: shrw $9, %cx
; X64-NEXT: movl $45, %ebx
; X64-NEXT: movl $45, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -128,8 +128,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: callq fesetround
; X64-NEXT: fnstcw {{[0-9]+}}(%rsp)
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: andb $6, %cl
+; X64-NEXT: andl $3072, %ecx # imm = 0xC00
+; X64-NEXT: shrw $9, %cx
; X64-NEXT: movl $45, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %eax
@@ -141,8 +141,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: callq fesetround
; X64-NEXT: fnstcw {{[0-9]+}}(%rsp)
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: andb $6, %cl
+; X64-NEXT: andl $3072, %ecx # imm = 0xC00
+; X64-NEXT: shrw $9, %cx
; X64-NEXT: movl $45, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %eax
@@ -153,8 +153,8 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: callq fesetround
; X64-NEXT: fnstcw {{[0-9]+}}(%rsp)
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: andb $6, %cl
+; X64-NEXT: andl $3072, %ecx # imm = 0xC00
+; X64-NEXT: shrw $9, %cx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %ebx
; X64-NEXT: andl $3, %ebx
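
All copies of this hunk are one transform: the get-rounding lowering reads the x87 control word with fnstcw and extracts the rounding-control field, and (cw >> 9) & 6 is now emitted as (cw & 0xC00) >> 9; since 6 << 9 == 0xC00 the two are equal, the mask has just been hoisted above the shift. A reduced scalar form of the identity (hypothetical function, not from the test):

    define i32 @rc_field(i16 %cw) {
      %w = zext i16 %cw to i32
      %s = lshr i32 %w, 9
      %f = and i32 %s, 6      ; == (%w & 0xC00) >> 9
      ret i32 %f
    }

The one visible cost is that the shift is now done in 16 bits (shrw), which needs a 66h operand-size prefix.
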
diff --git a/llvm/test/CodeGen/X86/fold-and-shift.ll b/llvm/test/CodeGen/X86/fold-and-shift.ll
index 985d7c6c82f06d..f7e9cad907aa13 100644
--- a/llvm/test/CodeGen/X86/fold-and-shift.ll
+++ b/llvm/test/CodeGen/X86/fold-and-shift.ll
@@ -30,7 +30,8 @@ define i32 @t2(ptr %X, i32 %i) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl %cx, %ecx
-; X86-NEXT: movl (%eax,%ecx,4), %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: movl (%eax,%ecx,2), %eax
; X86-NEXT: retl
;
; X64-LABEL: t2:
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 96b2e1ef982765..a1b69d0993ecee 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1336,8 +1336,9 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: andl $31, %edi
+; CHECK-SSE-NEXT: addl $3, %edi
; CHECK-SSE-NEXT: shll $23, %edi
-; CHECK-SSE-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
+; CHECK-SSE-NEXT: movl $-1090519040, %eax # imm = 0xBF000000
; CHECK-SSE-NEXT: subl %edi, %eax
; CHECK-SSE-NEXT: movd %eax, %xmm0
; CHECK-SSE-NEXT: retq
@@ -1345,8 +1346,9 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: andl $31, %edi
+; CHECK-AVX-NEXT: addl $3, %edi
; CHECK-AVX-NEXT: shll $23, %edi
-; CHECK-AVX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
+; CHECK-AVX-NEXT: movl $-1090519040, %eax # imm = 0xBF000000
; CHECK-AVX-NEXT: subl %edi, %eax
; CHECK-AVX-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 9121cf2d654a39..3518d9f0be173a 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -757,6 +757,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shlb %cl, %al
+; X86-NEXT: andb $-128, %al
; X86-NEXT: shrb $7, %al
; X86-NEXT: retl
;
@@ -766,6 +767,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shlb %cl, %al
+; X64-NEXT: andb $-128, %al
; X64-NEXT: shrb $7, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 1a2aac657d30fb..07c0cb960c8001 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -676,6 +676,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrb %cl, %al
+; X86-NEXT: andb $-128, %al
; X86-NEXT: shrb $7, %al
; X86-NEXT: retl
;
@@ -685,6 +686,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrb %cl, %al
+; X64-NEXT: andb $-128, %al
; X64-NEXT: shrb $7, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 7fb07c6b3163e7..ef148b4963b1e2 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -38,7 +38,7 @@ define void @i24_and_or(ptr %a) {
; X86-NEXT: shll $16, %edx
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: orl $384, %edx # imm = 0x180
-; X86-NEXT: andl $-128, %edx
+; X86-NEXT: andl $16777088, %edx # imm = 0xFFFF80
; X86-NEXT: movw %dx, (%eax)
; X86-NEXT: retl
;
@@ -49,7 +49,7 @@ define void @i24_and_or(ptr %a) {
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: orl $384, %ecx # imm = 0x180
-; X64-NEXT: andl $-128, %ecx
+; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
; X64-NEXT: movw %cx, (%rdi)
; X64-NEXT: retq
%b = load i24, ptr %a, align 1
@@ -108,6 +108,13 @@ define void @i56_or(ptr %a) {
;
; X64-LABEL: i56_or:
; X64: # %bb.0:
+; X64-NEXT: movzwl 4(%rdi), %eax
+; X64-NEXT: movzbl 6(%rdi), %ecx
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: movw %cx, 4(%rdi)
; X64-NEXT: orl $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i56, ptr %a, align 1
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index 16946caf9a328f..c0c6ede92f8628 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -2285,7 +2285,8 @@ define i32 @PR44139(ptr %p) {
; SSE-NEXT: leal 2147483647(%rax), %ecx
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; SSE-NEXT: shrl $31, %ecx
+; SSE-NEXT: shll $31, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: xorl %edx, %edx
@@ -2305,7 +2306,8 @@ define i32 @PR44139(ptr %p) {
; AVX1-NEXT: leal 2147483647(%rax), %ecx
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: cmovnsl %eax, %ecx
-; AVX1-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; AVX1-NEXT: shrl $31, %ecx
+; AVX1-NEXT: shll $31, %ecx
; AVX1-NEXT: addl %eax, %ecx
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: xorl %edx, %edx
@@ -2326,7 +2328,8 @@ define i32 @PR44139(ptr %p) {
; AVX2-NEXT: leal 2147483647(%rax), %ecx
; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: cmovnsl %eax, %ecx
-; AVX2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; AVX2-NEXT: shrl $31, %ecx
+; AVX2-NEXT: shll $31, %ecx
; AVX2-NEXT: addl %eax, %ecx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: xorl %edx, %edx
@@ -2345,7 +2348,8 @@ define i32 @PR44139(ptr %p) {
; AVX512-NEXT: leal 2147483647(%rax), %ecx
; AVX512-NEXT: testl %eax, %eax
; AVX512-NEXT: cmovnsl %eax, %ecx
-; AVX512-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; AVX512-NEXT: shrl $31, %ecx
+; AVX512-NEXT: shll $31, %ecx
; AVX512-NEXT: addl %eax, %ecx
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512-NEXT: xorl %edx, %edx
@@ -2367,7 +2371,8 @@ define i32 @PR44139(ptr %p) {
; X86AVX2-NEXT: leal 2147483647(%eax), %ecx
; X86AVX2-NEXT: testl %eax, %eax
; X86AVX2-NEXT: cmovnsl %eax, %ecx
-; X86AVX2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; X86AVX2-NEXT: shrl $31, %ecx
+; X86AVX2-NEXT: shll $31, %ecx
; X86AVX2-NEXT: addl %eax, %ecx
; X86AVX2-NEXT: xorl %edx, %edx
; X86AVX2-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/int-to-fp-demanded.ll b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
index cdde03fb0534b9..c30a52baede9f9 100644
--- a/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
+++ b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
@@ -21,6 +21,7 @@ define i32 @sitofp_signbit_only(i32 %i_in) nounwind {
; X64: # %bb.0:
; X64-NEXT: cvtsi2ss %edi, %xmm0
; X64-NEXT: movmskps %xmm0, %eax
+; X64-NEXT: andl $1, %eax
; X64-NEXT: shll $31, %eax
; X64-NEXT: retq
%f = sitofp i32 %i_in to float
@@ -47,6 +48,7 @@ define i32 @sitofp_signbit_only_okay_width(i16 %i_in) nounwind {
; X64-NEXT: movswl %di, %eax
; X64-NEXT: cvtsi2ss %eax, %xmm0
; X64-NEXT: movmskps %xmm0, %eax
+; X64-NEXT: andl $1, %eax
; X64-NEXT: shll $31, %eax
; X64-NEXT: retq
%f = sitofp i16 %i_in to float
@@ -70,6 +72,7 @@ define i32 @sitofp_signbit_only_fail_bad_width1(i64 %i_in) nounwind {
; X64: # %bb.0:
; X64-NEXT: cvtsi2ss %rdi, %xmm0
; X64-NEXT: movmskps %xmm0, %eax
+; X64-NEXT: andl $1, %eax
; X64-NEXT: shll $31, %eax
; X64-NEXT: retq
%f = sitofp i64 %i_in to float
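
The new andl $1 in these three functions looks redundant: movmskps only sets the low four bits, and shll $31 already discards everything above bit 0, so the old sequence was correct as-is. The mask most likely survives because the shl now carries flags that a demanded-bits simplification would invalidate; either way it is a one-instruction regression worth tracking. A plausible reduction of sitofp_signbit_only (only the sitofp line is visible above; the bitcast and mask are reconstructed):

    define i32 @sitofp_signbit_only(i32 %i_in) {
      %f = sitofp i32 %i_in to float
      %b = bitcast float %f to i32
      %r = and i32 %b, -2147483648   ; keep only the sign bit
      ret i32 %r
    }
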
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 2046d790cc57e4..b9dc63a20f4a18 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -1474,13 +1474,12 @@ define <4 x i1> @isnan_v4f(<4 x float> %x) {
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
; CHECK-32-NEXT: setp %dh
-; CHECK-32-NEXT: shlb $2, %dh
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
; CHECK-32-NEXT: setp %dl
-; CHECK-32-NEXT: shlb $3, %dl
+; CHECK-32-NEXT: addb %dl, %dl
; CHECK-32-NEXT: orb %dh, %dl
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
@@ -1494,6 +1493,7 @@ define <4 x i1> @isnan_v4f(<4 x float> %x) {
; CHECK-32-NEXT: setp %al
; CHECK-32-NEXT: addb %al, %al
; CHECK-32-NEXT: orb %dh, %al
+; CHECK-32-NEXT: shlb $2, %al
; CHECK-32-NEXT: orb %dl, %al
; CHECK-32-NEXT: movb %al, (%ecx)
; CHECK-32-NEXT: movl %ecx, %eax
@@ -1520,12 +1520,11 @@ define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
; CHECK-32-NEXT: andl %ecx, %edx
; CHECK-32-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001
; CHECK-32-NEXT: setge %dh
-; CHECK-32-NEXT: shlb $2, %dh
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-32-NEXT: andl %ecx, %esi
; CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001
; CHECK-32-NEXT: setge %dl
-; CHECK-32-NEXT: shlb $3, %dl
+; CHECK-32-NEXT: addb %dl, %dl
; CHECK-32-NEXT: orb %dh, %dl
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-32-NEXT: andl %ecx, %esi
@@ -1536,6 +1535,7 @@ define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
; CHECK-32-NEXT: setge %cl
; CHECK-32-NEXT: addb %cl, %cl
; CHECK-32-NEXT: orb %dh, %cl
+; CHECK-32-NEXT: shlb $2, %cl
; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc15617d5..34ed562427e1dd 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -103,8 +103,8 @@ define i1 @pow2_srl(i32 %x, i32 %y) {
; CHECK-NEXT: andb $7, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shll %cl, %eax
+; CHECK-NEXT: andl $1048576, %eax # imm = 0x100000
; CHECK-NEXT: shrl $20, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%yy = and i32 %y, 7
diff --git a/llvm/test/CodeGen/X86/lea-dagdag.ll b/llvm/test/CodeGen/X86/lea-dagdag.ll
index f81851a92d8de0..6abdbd53d08ad7 100644
--- a/llvm/test/CodeGen/X86/lea-dagdag.ll
+++ b/llvm/test/CodeGen/X86/lea-dagdag.ll
@@ -6,9 +6,9 @@ define i16 @and_i8_zext_shl_add_i16(i16 %t0, i8 %t1) {
; CHECK-LABEL: and_i8_zext_shl_add_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: andl $8, %esi
-; CHECK-NEXT: leal (%rdi,%rsi,4), %eax
+; CHECK-NEXT: leal (,%rsi,4), %eax
+; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%t4 = and i8 %t1, 8
@@ -82,10 +82,10 @@ define i32 @and_i16_zext_shl_add_i32(i32 %t0, i16 %t1) {
define i32 @and_i16_shl_zext_add_i32(i32 %t0, i16 %t1) {
; CHECK-LABEL: and_i16_shl_zext_add_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: andl $8, %esi
-; CHECK-NEXT: leal (%rdi,%rsi,4), %eax
+; CHECK-NEXT: movzwl %si, %eax
+; CHECK-NEXT: leal (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
%t4 = and i16 %t1, 8
%sh = shl i16 %t4, 2
diff --git a/llvm/test/CodeGen/X86/lea.ll b/llvm/test/CodeGen/X86/lea.ll
index 33d121f6849ba5..074c8f28b91a7c 100644
--- a/llvm/test/CodeGen/X86/lea.ll
+++ b/llvm/test/CodeGen/X86/lea.ll
@@ -32,7 +32,8 @@ define i32 @test2(i32 %x_offs) nounwind readnone {
; LINUX-NEXT: jl .LBB1_2
; LINUX-NEXT: # %bb.1: # %bb.nph
; LINUX-NEXT: leal -5(%rdi), %eax
-; LINUX-NEXT: andl $-4, %eax
+; LINUX-NEXT: shrl $2, %eax
+; LINUX-NEXT: shll $2, %eax
; LINUX-NEXT: negl %eax
; LINUX-NEXT: leal -4(%rdi,%rax), %eax
; LINUX-NEXT: retq
@@ -47,7 +48,8 @@ define i32 @test2(i32 %x_offs) nounwind readnone {
; WIN-NEXT: jl .LBB1_2
; WIN-NEXT: # %bb.1: # %bb.nph
; WIN-NEXT: leal -5(%rcx), %eax
-; WIN-NEXT: andl $-4, %eax
+; WIN-NEXT: shrl $2, %eax
+; WIN-NEXT: shll $2, %eax
; WIN-NEXT: negl %eax
; WIN-NEXT: leal -4(%rcx,%rax), %eax
; WIN-NEXT: retq
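
The test2 hunks show the recurring and-to-shift-pair change in isolation: x & -4 and (x >> 2) << 2 are the same value, and with flags inferred during the visit the shift-pair form now survives to isel. The identity in IR (a standalone sketch, not from the test):

    define i32 @align_down_4(i32 %x) {
      %lo = lshr i32 %x, 2
      %aligned = shl i32 %lo, 2    ; == and i32 %x, -4
      ret i32 %aligned
    }

On x86 the and form is one instruction against two dependent shifts, so hunks of this shape read as small regressions.
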
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index eb5d172a3b3527..cb264acc73c670 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -5,26 +5,49 @@
define void @_start() nounwind {
; FAST-SHLD-LABEL: _start:
; FAST-SHLD: # %bb.0: # %Entry
-; FAST-SHLD-NEXT: movq -40(%rsp), %rax
-; FAST-SHLD-NEXT: movq -32(%rsp), %rcx
-; FAST-SHLD-NEXT: movq %rcx, %rdx
-; FAST-SHLD-NEXT: shlq $62, %rdx
-; FAST-SHLD-NEXT: shrq $2, %rcx
-; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx
-; FAST-SHLD-NEXT: andq $-4, %rax
-; FAST-SHLD-NEXT: incq %rax
-; FAST-SHLD-NEXT: movq %rax, -40(%rsp)
-; FAST-SHLD-NEXT: movq %rcx, -32(%rsp)
+; FAST-SHLD-NEXT: movl -24(%rsp), %eax
+; FAST-SHLD-NEXT: movq %rax, %rcx
+; FAST-SHLD-NEXT: shlq $62, %rcx
+; FAST-SHLD-NEXT: shrl $2, %eax
+; FAST-SHLD-NEXT: movq -40(%rsp), %rdx
+; FAST-SHLD-NEXT: movq -32(%rsp), %rsi
+; FAST-SHLD-NEXT: movq %rsi, %rdi
+; FAST-SHLD-NEXT: shlq $62, %rdi
+; FAST-SHLD-NEXT: shrq $2, %rsi
+; FAST-SHLD-NEXT: shldq $2, %rdi, %rsi
+; FAST-SHLD-NEXT: shrq $2, %rdx
+; FAST-SHLD-NEXT: leaq 1(,%rdx,4), %rdx
+; FAST-SHLD-NEXT: movq %rdx, -40(%rsp)
+; FAST-SHLD-NEXT: movq %rsi, -32(%rsp)
+; FAST-SHLD-NEXT: shrq $62, %rcx
+; FAST-SHLD-NEXT: leal (%rcx,%rax,4), %eax
+; FAST-SHLD-NEXT: andl $7, %eax
+; FAST-SHLD-NEXT: movb %al, -24(%rsp)
; FAST-SHLD-NEXT: orq $-2, -56(%rsp)
; FAST-SHLD-NEXT: movq $-1, -48(%rsp)
; FAST-SHLD-NEXT: retq
;
; SLOW-SHLD-LABEL: _start:
; SLOW-SHLD: # %bb.0: # %Entry
-; SLOW-SHLD-NEXT: movq -40(%rsp), %rax
-; SLOW-SHLD-NEXT: andq $-4, %rax
-; SLOW-SHLD-NEXT: incq %rax
-; SLOW-SHLD-NEXT: movq %rax, -40(%rsp)
+; SLOW-SHLD-NEXT: movl -24(%rsp), %eax
+; SLOW-SHLD-NEXT: movq %rax, %rcx
+; SLOW-SHLD-NEXT: shlq $62, %rcx
+; SLOW-SHLD-NEXT: shrl $2, %eax
+; SLOW-SHLD-NEXT: movq -40(%rsp), %rdx
+; SLOW-SHLD-NEXT: movq -32(%rsp), %rsi
+; SLOW-SHLD-NEXT: movq %rsi, %rdi
+; SLOW-SHLD-NEXT: shrq $2, %rdi
+; SLOW-SHLD-NEXT: shlq $62, %rsi
+; SLOW-SHLD-NEXT: shrq $2, %rdx
+; SLOW-SHLD-NEXT: leaq 1(,%rdx,4), %rdx
+; SLOW-SHLD-NEXT: movq %rdx, -40(%rsp)
+; SLOW-SHLD-NEXT: shrq $62, %rsi
+; SLOW-SHLD-NEXT: leaq (%rsi,%rdi,4), %rdx
+; SLOW-SHLD-NEXT: movq %rdx, -32(%rsp)
+; SLOW-SHLD-NEXT: shrq $62, %rcx
+; SLOW-SHLD-NEXT: leal (%rcx,%rax,4), %eax
+; SLOW-SHLD-NEXT: andl $7, %eax
+; SLOW-SHLD-NEXT: movb %al, -24(%rsp)
; SLOW-SHLD-NEXT: orq $-2, -56(%rsp)
; SLOW-SHLD-NEXT: movq $-1, -48(%rsp)
; SLOW-SHLD-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/load-local-v4i5.ll b/llvm/test/CodeGen/X86/load-local-v4i5.ll
index 1d119b1dfefc28..2b5ed901842373 100644
--- a/llvm/test/CodeGen/X86/load-local-v4i5.ll
+++ b/llvm/test/CodeGen/X86/load-local-v4i5.ll
@@ -11,9 +11,6 @@ define void @_start() {
; CHECK-NEXT: movzbl -9(%rsp), %ecx
; CHECK-NEXT: movzbl -10(%rsp), %edx
; CHECK-NEXT: movzbl -11(%rsp), %esi
-; CHECK-NEXT: movzbl %cl, %edi
-; CHECK-NEXT: shrb %cl
-; CHECK-NEXT: movb %cl, -2(%rsp)
; CHECK-NEXT: andl $31, %eax
; CHECK-NEXT: andl $31, %esi
; CHECK-NEXT: shll $5, %esi
@@ -21,12 +18,16 @@ define void @_start() {
; CHECK-NEXT: andl $31, %edx
; CHECK-NEXT: shll $10, %edx
; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shll $15, %eax
-; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: movw %ax, -4(%rsp)
-; CHECK-NEXT: movb %dil, -5(%rsp)
-; CHECK-NEXT: cmpb $31, %dil
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll $15, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: movw %cx, -4(%rsp)
+; CHECK-NEXT: andl $983040, %ecx # imm = 0xF0000
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movb %cl, -2(%rsp)
+; CHECK-NEXT: movb %al, -5(%rsp)
+; CHECK-NEXT: cmpb $31, %al
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %Then
; CHECK-NEXT: int3
diff --git a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
index e48618ba7a53d3..36ed8622b3d92a 100644
--- a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
+++ b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
@@ -31,7 +31,8 @@ define dso_local i32 @test(ptr %secret, i32 %secret_size) #0 {
; X64-ALL-NEXT: movl %eax, %ecx
; X64-ALL-NEXT: shrl $31, %ecx
; X64-ALL-NEXT: addl %eax, %ecx
-; X64-ALL-NEXT: andl $-2, %ecx
+; X64-ALL-NEXT: sarl %ecx
+; X64-ALL-NEXT: addl %ecx, %ecx
; X64-ALL-NEXT: cmpl %ecx, %eax
; X64-ALL-NEXT: jne .LBB0_4
; X64-ALL-NEXT: # %bb.3: # %if.then
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 253f990f8735ee..e1b74f40013769 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -3654,10 +3654,10 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl $15, %ecx
; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: shrl $8, %edx
-; SSE-NEXT: andl $1, %edx
-; SSE-NEXT: andl $8, %eax
-; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: andl $8, %edx
+; SSE-NEXT: shrw $3, %dx
+; SSE-NEXT: andl $256, %eax # imm = 0x100
+; SSE-NEXT: movzbl %ah, %eax
; SSE-NEXT: xorl %edx, %eax
; SSE-NEXT: andl %ecx, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -3670,10 +3670,10 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
; AVX1OR2-NEXT: movl %eax, %ecx
; AVX1OR2-NEXT: shrl $15, %ecx
; AVX1OR2-NEXT: movl %eax, %edx
-; AVX1OR2-NEXT: shrl $8, %edx
-; AVX1OR2-NEXT: andl $1, %edx
-; AVX1OR2-NEXT: andl $8, %eax
-; AVX1OR2-NEXT: shrl $3, %eax
+; AVX1OR2-NEXT: andl $8, %edx
+; AVX1OR2-NEXT: shrw $3, %dx
+; AVX1OR2-NEXT: andl $256, %eax # imm = 0x100
+; AVX1OR2-NEXT: movzbl %ah, %eax
; AVX1OR2-NEXT: xorl %edx, %eax
; AVX1OR2-NEXT: andl %ecx, %eax
; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/movmsk.ll b/llvm/test/CodeGen/X86/movmsk.ll
index 685c07454c5e10..35d96325338ada 100644
--- a/llvm/test/CodeGen/X86/movmsk.ll
+++ b/llvm/test/CodeGen/X86/movmsk.ll
@@ -10,7 +10,9 @@ define i32 @double_signbit(double %d1) nounwind uwtable readnone ssp {
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movmskpd %xmm0, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: shlq $63, %rax
+; CHECK-NEXT: shrq $63, %rax
+; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca double, align 8
@@ -30,7 +32,9 @@ define i32 @double_add_signbit(double %d1, double %d2) nounwind uwtable readnone
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movmskpd %xmm0, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: shlq $63, %rax
+; CHECK-NEXT: shrq $63, %rax
+; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca double, align 8
@@ -51,6 +55,8 @@ define i32 @float_signbit(float %f1) nounwind uwtable readnone ssp {
; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movmskps %xmm0, %eax
; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: shll $31, %eax
+; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca float, align 4
@@ -70,6 +76,8 @@ define i32 @float_add_signbit(float %f1, float %f2) nounwind uwtable readnone ss
; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movmskps %xmm0, %eax
; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: shll $31, %eax
+; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca float, align 4
@@ -87,7 +95,9 @@ define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movmskpd %xmm0, %edi
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: shlq $63, %rdi
+; CHECK-NEXT: shrq $63, %rdi
+; CHECK-NEXT: ## kill: def $edi killed $edi killed $rdi
; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
entry:
%t0 = bitcast double %n to i64
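
Same shift-pair-for-mask story in the other direction: keeping only the low bit of a movmskpd result, x & 1, now comes out as (x << 63) >> 63 in the 64-bit functions above (and as shll $31 / shrl $31 in the 32-bit ones). The equivalence in IR:

    define i64 @low_bit(i64 %x) {
      %hi = shl i64 %x, 63
      %bit = lshr i64 %hi, 63    ; == and i64 %x, 1
      ret i64 %bit
    }

Again a single andl becomes two shifts, so this is churn to keep an eye on rather than a win.
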
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 384e40496d82a6..b309c5b1c286a3 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -120,51 +120,63 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind {
define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) {
; SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
; SSE2: # %bb.0:
+; SSE2-NEXT: psllq $63, %xmm1
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrlq $63, %xmm0
+; SSE2-NEXT: psrlq $63, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
; SSE4: # %bb.0:
+; SSE4-NEXT: psllq $63, %xmm1
+; SSE4-NEXT: psllq $63, %xmm0
+; SSE4-NEXT: psrlq $63, %xmm0
+; SSE4-NEXT: psrlq $63, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE4-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,1,1,1]
-; SSE4-NEXT: pand %xmm2, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE4-NEXT: pand %xmm2, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
; SSE4-NEXT: ret{{[l|q]}}
;
-; X86-AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vzeroupper
-; X86-AVX1-NEXT: retl
+; AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: ret{{[l|q]}}
;
-; X64-AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
+; X86-AVX2-LABEL: trunc_lshr_v4i64_demandedelts:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0]
+; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
;
-; AVX2-LABEL: trunc_lshr_v4i64_demandedelts:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: ret{{[l|q]}}
+; X64-AVX2-LABEL: trunc_lshr_v4i64_demandedelts:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,63,0]
+; X64-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; X64-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%1 = shl <4 x i64> %a0, <i64 63, i64 0, i64 63, i64 0>
%2 = lshr <4 x i64> %1, <i64 63, i64 0, i64 63, i64 0>
%3 = bitcast <4 x i64> %2 to <8 x i32>
@@ -446,9 +458,7 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
ret <32 x i8> %4
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-AVX2: {{.*}}
; X64-SSE2: {{.*}}
; X64-SSE4: {{.*}}
-; X86-AVX2: {{.*}}
; X86-SSE2: {{.*}}
; X86-SSE4: {{.*}}
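
This is the vector version of the same change: the shl/lshr by <63, 0, 63, 0> used to be recognized, via demanded elements, as an and with <1, 1, 1, 1> feeding the pack, and now the psllq/psrlq pair survives. The source pattern, reduced to just the shift pair (the real test continues with a bitcast and a truncating pack):

    define <4 x i64> @shl_lshr_63(<4 x i64> %a0) {
      %1 = shl <4 x i64> %a0, <i64 63, i64 0, i64 63, i64 0>
      %2 = lshr <4 x i64> %1, <i64 63, i64 0, i64 63, i64 0>
      ret <4 x i64> %2
    }

The prefix churn at the bottom follows from this: X86-AVX1/X64-AVX1 collapse into a common AVX1 block, while AVX2 splits into X86-AVX2/X64-AVX2 because the broadcast shift-amount constants are printed differently per target.
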
diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba5ab4336..ac8b2f8408679f 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -408,7 +408,7 @@ define i16 @parity_16_shift(i16 %0) {
; X86-NOPOPCNT-NEXT: xorl %eax, %eax
; X86-NOPOPCNT-NEXT: xorb %ch, %cl
; X86-NOPOPCNT-NEXT: setnp %al
-; X86-NOPOPCNT-NEXT: addl %eax, %eax
+; X86-NOPOPCNT-NEXT: addw %ax, %ax
; X86-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NOPOPCNT-NEXT: retl
;
@@ -418,7 +418,7 @@ define i16 @parity_16_shift(i16 %0) {
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
-; X64-NOPOPCNT-NEXT: addl %eax, %eax
+; X64-NOPOPCNT-NEXT: addw %ax, %ax
; X64-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NOPOPCNT-NEXT: retq
;
@@ -427,7 +427,7 @@ define i16 @parity_16_shift(i16 %0) {
; X86-POPCNT-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-POPCNT-NEXT: popcntl %eax, %eax
; X86-POPCNT-NEXT: andl $1, %eax
-; X86-POPCNT-NEXT: addl %eax, %eax
+; X86-POPCNT-NEXT: addw %ax, %ax
; X86-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax
; X86-POPCNT-NEXT: retl
;
@@ -436,7 +436,7 @@ define i16 @parity_16_shift(i16 %0) {
; X64-POPCNT-NEXT: movzwl %di, %eax
; X64-POPCNT-NEXT: popcntl %eax, %eax
; X64-POPCNT-NEXT: andl $1, %eax
-; X64-POPCNT-NEXT: addl %eax, %eax
+; X64-POPCNT-NEXT: addw %ax, %ax
; X64-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax
; X64-POPCNT-NEXT: retq
%2 = tail call i16 @llvm.ctpop.i16(i16 %0)
@@ -637,7 +637,7 @@ define i64 @parity_64_shift(i64 %0) {
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
-; X64-NOPOPCNT-NEXT: addl %eax, %eax
+; X64-NOPOPCNT-NEXT: addq %rax, %rax
; X64-NOPOPCNT-NEXT: retq
;
; X86-POPCNT-LABEL: parity_64_shift:
@@ -654,7 +654,7 @@ define i64 @parity_64_shift(i64 %0) {
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: andl $1, %eax
-; X64-POPCNT-NEXT: addl %eax, %eax
+; X64-POPCNT-NEXT: addq %rax, %rax
; X64-POPCNT-NEXT: retq
%2 = tail call i64 @llvm.ctpop.i64(i64 %0)
%3 = shl nuw nsw i64 %2, 1
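
The parity changes are width-only: the trailing shl-by-1 is now emitted in the value's own type (addw %ax, %ax for i16, addq %rax, %rax for i64) instead of being done as a 32-bit addl, presumably because the nuw/nsw flags visible on the shl keep the node from being retyped. Reduced to the two IR lines in the context above:

    declare i64 @llvm.ctpop.i64(i64)

    define i64 @parity_64_shift(i64 %0) {
      %2 = tail call i64 @llvm.ctpop.i64(i64 %0)
      %3 = shl nuw nsw i64 %2, 1
      ret i64 %3
    }

Performance-wise this is neutral apart from encoding size (the 66h prefix on addw, REX.W on addq).
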
diff --git a/llvm/test/CodeGen/X86/pr22970.ll b/llvm/test/CodeGen/X86/pr22970.ll
index 28aea0f2af58c0..20fc5d2e066ae3 100644
--- a/llvm/test/CodeGen/X86/pr22970.ll
+++ b/llvm/test/CodeGen/X86/pr22970.ll
@@ -37,7 +37,8 @@ define i32 @PR22970_i64(ptr nocapture readonly, i64) {
; X64-LABEL: PR22970_i64:
; X64: # %bb.0:
; X64-NEXT: andl $4095, %esi # imm = 0xFFF
-; X64-NEXT: movl 32(%rdi,%rsi,4), %eax
+; X64-NEXT: addl $8, %esi
+; X64-NEXT: movl (%rdi,%rsi,4), %eax
; X64-NEXT: retq
%3 = and i64 %1, 4095
%4 = add nuw nsw i64 %3, 8
diff --git a/llvm/test/CodeGen/X86/pr29170.ll b/llvm/test/CodeGen/X86/pr29170.ll
index a27238a8e4f18f..067ab693243c0e 100644
--- a/llvm/test/CodeGen/X86/pr29170.ll
+++ b/llvm/test/CodeGen/X86/pr29170.ll
@@ -14,7 +14,9 @@ define i32 @main() {
; CHECK-NEXT: jne .LBB0_3
; CHECK-NEXT: # %bb.1: # %go
; CHECK-NEXT: movl $-1, %ecx
-; CHECK-NEXT: movsbl b, %edx
+; CHECK-NEXT: movl b, %edx
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: sarw $8, %dx
; CHECK-NEXT: notl %ecx
; CHECK-NEXT: movzwl %dx, %edx
; CHECK-NEXT: cmpl $-1, %edx
diff --git a/llvm/test/CodeGen/X86/pr32420.ll b/llvm/test/CodeGen/X86/pr32420.ll
index 52d42520900d89..d13853b498949d 100644
--- a/llvm/test/CodeGen/X86/pr32420.ll
+++ b/llvm/test/CodeGen/X86/pr32420.ll
@@ -14,14 +14,12 @@ define i32 @PR32420() {
; CHECK-NEXT: movzwl (%rcx), %eax
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: shll $12, %edx
-; CHECK-NEXT: movswl %dx, %edx
-; CHECK-NEXT: shrl $12, %edx
+; CHECK-NEXT: sarw $12, %dx
; CHECK-NEXT: movq _b at GOTPCREL(%rip), %rsi
; CHECK-NEXT: orw (%rsi), %dx
; CHECK-NEXT: movl (%rcx), %ecx
; CHECK-NEXT: shll $12, %ecx
-; CHECK-NEXT: movswl %cx, %ecx
-; CHECK-NEXT: shrl $12, %ecx
+; CHECK-NEXT: sarw $12, %cx
; CHECK-NEXT: andl %edx, %ecx
; CHECK-NEXT: movw %cx, (%rsi)
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr32588.ll b/llvm/test/CodeGen/X86/pr32588.ll
index 2ba2f899f7dd9f..ff7e02c9319a5f 100644
--- a/llvm/test/CodeGen/X86/pr32588.ll
+++ b/llvm/test/CodeGen/X86/pr32588.ll
@@ -8,10 +8,20 @@
define void @fn1() {
; CHECK-LABEL: fn1:
; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpl $0, c(%rip)
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movl %eax, d(%rip)
+; CHECK-NEXT: cmpl $0, b(%rip)
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: cmpl $0, d(%rip)
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: andb %al, %cl
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll $31, %ecx
+; CHECK-NEXT: shrl $31, %ecx
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: sarl %ecx
+; CHECK-NEXT: cmpl $1, c(%rip)
+; CHECK-NEXT: adcl $0, %ecx
+; CHECK-NEXT: movl %ecx, d(%rip)
; CHECK-NEXT: retq
%t0 = load i32, ptr @c, align 4
%tobool1 = icmp eq i32 %t0, 0
diff --git a/llvm/test/CodeGen/X86/pr45995.ll b/llvm/test/CodeGen/X86/pr45995.ll
index 997ad6be84b9d1..7a1b608ed21fd7 100644
--- a/llvm/test/CodeGen/X86/pr45995.ll
+++ b/llvm/test/CodeGen/X86/pr45995.ll
@@ -10,6 +10,7 @@ define void @extracter0([4 x <4 x i1>] %matrix) nounwind {
; CHECK-NEXT: vpslld xmm0, xmm0, 31
; CHECK-NEXT: vmovmskps edi, xmm0
; CHECK-NEXT: mov ebx, edi
+; CHECK-NEXT: and bl, 8
; CHECK-NEXT: shr bl, 3
; CHECK-NEXT: mov ebp, edi
; CHECK-NEXT: and bpl, 4
@@ -51,40 +52,42 @@ define void @extracter1([4 x <4 x i1>] %matrix) nounwind {
; CHECK-NEXT: push rbx
; CHECK-NEXT: push rax
; CHECK-NEXT: vpslld xmm1, xmm1, 31
-; CHECK-NEXT: vmovmskps ebx, xmm1
-; CHECK-NEXT: mov eax, ebx
-; CHECK-NEXT: shr al, 3
-; CHECK-NEXT: mov byte ptr [rsp + 7], al # 1-byte Spill
-; CHECK-NEXT: mov r14d, ebx
+; CHECK-NEXT: vmovmskps eax, xmm1
+; CHECK-NEXT: mov dword ptr [rsp + 4], eax # 4-byte Spill
+; CHECK-NEXT: mov ebp, eax
+; CHECK-NEXT: and bpl, 8
+; CHECK-NEXT: shr bpl, 3
+; CHECK-NEXT: mov r14d, eax
; CHECK-NEXT: and r14b, 4
; CHECK-NEXT: shr r14b, 2
-; CHECK-NEXT: mov r15d, ebx
+; CHECK-NEXT: mov r15d, eax
; CHECK-NEXT: and r15b, 2
; CHECK-NEXT: shr r15b
; CHECK-NEXT: vpslld xmm0, xmm0, 31
; CHECK-NEXT: vmovmskps edi, xmm0
; CHECK-NEXT: mov r12d, edi
+; CHECK-NEXT: and r12b, 8
; CHECK-NEXT: shr r12b, 3
; CHECK-NEXT: mov r13d, edi
; CHECK-NEXT: and r13b, 4
; CHECK-NEXT: shr r13b, 2
-; CHECK-NEXT: mov ebp, edi
-; CHECK-NEXT: and bpl, 2
-; CHECK-NEXT: shr bpl
+; CHECK-NEXT: mov ebx, edi
+; CHECK-NEXT: and bl, 2
+; CHECK-NEXT: shr bl
; CHECK-NEXT: call print_i1 at PLT
-; CHECK-NEXT: movzx edi, bpl
+; CHECK-NEXT: movzx edi, bl
; CHECK-NEXT: call print_i1 at PLT
; CHECK-NEXT: movzx edi, r13b
; CHECK-NEXT: call print_i1 at PLT
; CHECK-NEXT: movzx edi, r12b
; CHECK-NEXT: call print_i1 at PLT
-; CHECK-NEXT: mov edi, ebx
+; CHECK-NEXT: mov edi, dword ptr [rsp + 4] # 4-byte Reload
; CHECK-NEXT: call print_i1 at PLT
; CHECK-NEXT: movzx edi, r15b
; CHECK-NEXT: call print_i1 at PLT
; CHECK-NEXT: movzx edi, r14b
; CHECK-NEXT: call print_i1 at PLT
-; CHECK-NEXT: movzx edi, byte ptr [rsp + 7] # 1-byte Folded Reload
+; CHECK-NEXT: movzx edi, bpl
; CHECK-NEXT: call print_i1 at PLT
; CHECK-NEXT: add rsp, 8
; CHECK-NEXT: pop rbx
diff --git a/llvm/test/CodeGen/X86/pr61923.ll b/llvm/test/CodeGen/X86/pr61923.ll
index 576b029cd03297..53eb37be8ba5c4 100644
--- a/llvm/test/CodeGen/X86/pr61923.ll
+++ b/llvm/test/CodeGen/X86/pr61923.ll
@@ -12,7 +12,8 @@ define void @test_loop(ptr align 1 %src, ptr align 1 %dest, i32 %len) {
; CHECK-NEXT: jb .LBB0_4
; CHECK-NEXT: # %bb.1: # %memcmp.loop.preheader
; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: andl $-32, %eax
+; CHECK-NEXT: shrl $5, %eax
+; CHECK-NEXT: shlq $5, %rax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %memcmp.loop
diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll
index 96f6a188193834..27a3c1311d0d37 100644
--- a/llvm/test/CodeGen/X86/pr77459.ll
+++ b/llvm/test/CodeGen/X86/pr77459.ll
@@ -120,20 +120,22 @@ define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; SSE2-NEXT: rolw $8, %ax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $3855, %ecx # imm = 0xF0F
-; SSE2-NEXT: shll $4, %ecx
+; SSE2-NEXT: shlw $4, %cx
; SSE2-NEXT: shrl $4, %eax
; SSE2-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $13107, %ecx # imm = 0x3333
+; SSE2-NEXT: shlw $2, %cx
; SSE2-NEXT: shrl $2, %eax
; SSE2-NEXT: andl $13107, %eax # imm = 0x3333
-; SSE2-NEXT: leal (%rax,%rcx,4), %eax
+; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $21845, %ecx # imm = 0x5555
+; SSE2-NEXT: addw %cx, %cx
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $21845, %eax # imm = 0x5555
-; SSE2-NEXT: leal (%rax,%rcx,2), %eax
+; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
index 0a00a72a2dc947..69f3d0a0470c3a 100644
--- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
@@ -15,9 +15,10 @@ define i32 @and_signbit_shl(i32 %x, ptr %dst) {
;
; X86-LABEL: and_signbit_shl:
; X86: # %bb.0:
-; X86-NEXT: movl 8(%esp), %ecx
; X86-NEXT: movzbl 6(%esp), %eax
-; X86-NEXT: shll $24, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movl 8(%esp), %ecx
+; X86-NEXT: shll $8, %eax
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -36,9 +37,10 @@ define i32 @and_nosignbit_shl(i32 %x, ptr %dst) {
;
; X86-LABEL: and_nosignbit_shl:
; X86: # %bb.0:
-; X86-NEXT: movl 8(%esp), %ecx
; X86-NEXT: movzbl 6(%esp), %eax
-; X86-NEXT: shll $24, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movl 8(%esp), %ecx
+; X86-NEXT: shll $8, %eax
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -51,17 +53,17 @@ define i32 @or_signbit_shl(i32 %x, ptr %dst) {
; X64-LABEL: or_signbit_shl:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: orl $-65536, %eax # imm = 0xFFFF0000
; X64-NEXT: shll $8, %eax
-; X64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: or_signbit_shl:
; X86: # %bb.0:
; X86-NEXT: movl 8(%esp), %ecx
-; X86-NEXT: movl 4(%esp), %eax
+; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
+; X86-NEXT: orl 4(%esp), %eax
; X86-NEXT: shll $8, %eax
-; X86-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -188,16 +190,17 @@ define i32 @and_signbit_lshr(i32 %x, ptr %dst) {
; X64-LABEL: and_signbit_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; X64-NEXT: shrl $8, %eax
-; X64-NEXT: andl $16776960, %eax # imm = 0xFFFF00
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: and_signbit_lshr:
; X86: # %bb.0:
-; X86-NEXT: movl 8(%esp), %ecx
; X86-NEXT: movzwl 6(%esp), %eax
-; X86-NEXT: shll $8, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movl 8(%esp), %ecx
+; X86-NEXT: shrl $8, %eax
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -209,8 +212,8 @@ define i32 @and_nosignbit_lshr(i32 %x, ptr %dst) {
; X64-LABEL: and_nosignbit_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000
; X64-NEXT: shrl $8, %eax
-; X64-NEXT: andl $8388352, %eax # imm = 0x7FFF00
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
@@ -369,16 +372,17 @@ define i32 @and_signbit_ashr(i32 %x, ptr %dst) {
; X64-LABEL: and_signbit_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; X64-NEXT: sarl $8, %eax
-; X64-NEXT: andl $-256, %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: and_signbit_ashr:
; X86: # %bb.0:
+; X86-NEXT: movzwl 6(%esp), %eax
+; X86-NEXT: shll $16, %eax
; X86-NEXT: movl 8(%esp), %ecx
-; X86-NEXT: movswl 6(%esp), %eax
-; X86-NEXT: shll $8, %eax
+; X86-NEXT: sarl $8, %eax
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -390,8 +394,8 @@ define i32 @and_nosignbit_ashr(i32 %x, ptr %dst) {
; X64-LABEL: and_nosignbit_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $8, %eax
-; X64-NEXT: andl $8388352, %eax # imm = 0x7FFF00
+; X64-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000
+; X64-NEXT: sarl $8, %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
@@ -400,7 +404,7 @@ define i32 @and_nosignbit_ashr(i32 %x, ptr %dst) {
; X86-NEXT: movl 8(%esp), %ecx
; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000
; X86-NEXT: andl 4(%esp), %eax
-; X86-NEXT: shrl $8, %eax
+; X86-NEXT: sarl $8, %eax
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = and i32 %x, 2147418112 ; 0x7FFF0000
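
These hunks keep the mask on the shift's input, (x & 0xFFFF0000) >> 8, instead of recreating it on the output, (x >> 8) & 0xFFFF00, and the ashr cases stay arithmetic shifts instead of being relaxed to lshr with an adjusted mask. The and_signbit_ashr source, with the store/return shape reconstructed from the asm:

    define i32 @and_signbit_ashr(i32 %x, ptr %dst) {
      %t0 = and i32 %x, 4294901760   ; 0xFFFF0000
      %t1 = ashr i32 %t0, 8
      store i32 %t1, ptr %dst
      ret i32 %t1
    }
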
diff --git a/llvm/test/CodeGen/X86/rem.ll b/llvm/test/CodeGen/X86/rem.ll
index 893b49f9a01791..5d01980eb91540 100644
--- a/llvm/test/CodeGen/X86/rem.ll
+++ b/llvm/test/CodeGen/X86/rem.ll
@@ -29,7 +29,8 @@ define i32 @test2(i32 %X) {
; CHECK-NEXT: leal 255(%eax), %ecx
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnsl %eax, %ecx
-; CHECK-NEXT: andl $-256, %ecx
+; CHECK-NEXT: sarl $8, %ecx
+; CHECK-NEXT: shll $8, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: retl
%tmp1 = srem i32 %X, 256
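
srem by a power of two is lowered as a bias for negative inputs (the leal 255(%eax) + cmovns above), an align-down to a multiple of 256, and a subtract; the align-down is where the and-to-shift-pair rewrite lands, and (c >> 8) << 8 equals c & -256 for the arithmetic shift as well, since the shl discards the replicated sign bits. The source, from the test above:

    define i32 @test2(i32 %X) {
      %tmp1 = srem i32 %X, 256
      ret i32 %tmp1
    }
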
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index b2b242fa29818f..042365daf87e19 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -499,76 +499,129 @@ define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind {
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-LABEL: v16i4:
; SSE: # %bb.0:
+; SSE-NEXT: psllw $4, %xmm0
+; SSE-NEXT: psrlw $4, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [3855,3855,3855,3855,3855,3855,3855,3855]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: psubb %xmm2, %xmm0
; SSE-NEXT: psllw $4, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: psubb %xmm2, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT: psllw $4, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: psllw $4, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: paddsb %xmm1, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: paddsb %xmm3, %xmm0
; SSE-NEXT: psrlw $4, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: psubb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v16i4:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855,3855,3855,3855,3855]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i4:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v16i4:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v16i4:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpternlogq $108, %xmm2, %xmm3, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpternlogq $108, %xmm2, %xmm3, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: retq
%z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index ce0b212aa4c26c..f8062332b26ce9 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -81,11 +81,10 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64-NEXT: testl %edx, %edx
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
-; X64-NEXT: cmovel %eax, %edi
-; X64-NEXT: addl %edi, %edi
-; X64-NEXT: movswl %di, %eax
-; X64-NEXT: shrl %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: cmovnel %edi, %eax
+; X64-NEXT: addl %eax, %eax
+; X64-NEXT: sarw %ax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
; X64-NEXT: retq
;
; X86-LABEL: func2:
@@ -93,14 +92,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll $14, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
-; X86-NEXT: idivl %edi
-; X86-NEXT: leal -1(%eax), %esi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: sets %bl
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sets %cl
@@ -108,10 +107,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setne %dl
; X86-NEXT: testb %cl, %dl
-; X86-NEXT: cmovel %eax, %esi
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: movswl %si, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -127,10 +125,12 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
define i16 @func3(i15 %x, i8 %y) nounwind {
; X64-LABEL: func3:
; X64: # %bb.0:
-; X64-NEXT: shll $8, %esi
-; X64-NEXT: movswl %si, %ecx
; X64-NEXT: addl %edi, %edi
-; X64-NEXT: shrl $4, %ecx
+; X64-NEXT: sarw %di
+; X64-NEXT: movsbl %sil, %ecx
+; X64-NEXT: shlw $7, %cx
+; X64-NEXT: addw %di, %di
+; X64-NEXT: sarw $3, %cx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cwtd
; X64-NEXT: idivw %cx
@@ -144,43 +144,44 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X64-NEXT: testw %dx, %dx
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
-; X64-NEXT: cmovel %eax, %esi
-; X64-NEXT: addl %esi, %esi
-; X64-NEXT: movswl %si, %eax
-; X64-NEXT: shrl %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: addl %eax, %eax
+; X64-NEXT: sarw %ax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
; X64-NEXT: retq
;
; X86-LABEL: func3:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $8, %eax
-; X86-NEXT: movswl %ax, %esi
; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: shrl $4, %esi
+; X86-NEXT: sarw %cx
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: shlw $7, %si
+; X86-NEXT: sarw $3, %si
+; X86-NEXT: addw %cx, %cx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cwtd
; X86-NEXT: idivw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testw %si, %si
+; X86-NEXT: sets %bl
; X86-NEXT: testw %cx, %cx
; X86-NEXT: sets %cl
-; X86-NEXT: testw %si, %si
-; X86-NEXT: sets %ch
-; X86-NEXT: xorb %cl, %ch
+; X86-NEXT: xorb %bl, %cl
; X86-NEXT: testw %dx, %dx
-; X86-NEXT: setne %cl
-; X86-NEXT: testb %ch, %cl
-; X86-NEXT: cmovel %eax, %edi
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: movswl %di, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%y2 = sext i8 %y to i15
%y3 = shl i15 %y2, 7
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7727a0ab6178c..29fdfa6d790eb3 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -15,7 +15,8 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movswl %si, %esi
; X64-NEXT: movswl %di, %ecx
-; X64-NEXT: shll $8, %ecx
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: shll $7, %ecx
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cltd
; X64-NEXT: idivl %esi
@@ -47,7 +48,8 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll $8, %ecx
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: shll $7, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
; X86-NEXT: idivl %esi
@@ -150,10 +152,12 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
;
; X64-LABEL: func3:
; X64: # %bb.0:
-; X64-NEXT: shll $8, %esi
-; X64-NEXT: movswl %si, %ecx
; X64-NEXT: addl %edi, %edi
-; X64-NEXT: shrl $4, %ecx
+; X64-NEXT: sarw %di
+; X64-NEXT: movsbl %sil, %ecx
+; X64-NEXT: shlw $7, %cx
+; X64-NEXT: addw %di, %di
+; X64-NEXT: sarw $3, %cx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cwtd
; X64-NEXT: idivw %cx
@@ -181,27 +185,29 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
;
; X86-LABEL: func3:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $8, %eax
-; X86-NEXT: movswl %ax, %esi
; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: shrl $4, %esi
+; X86-NEXT: sarw %cx
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: shlw $7, %si
+; X86-NEXT: sarw $3, %si
+; X86-NEXT: addw %cx, %cx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cwtd
; X86-NEXT: idivw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testw %si, %si
+; X86-NEXT: sets %bl
; X86-NEXT: testw %cx, %cx
; X86-NEXT: sets %cl
-; X86-NEXT: testw %si, %si
-; X86-NEXT: sets %ch
-; X86-NEXT: xorb %cl, %ch
+; X86-NEXT: xorb %bl, %cl
; X86-NEXT: testw %dx, %dx
-; X86-NEXT: setne %cl
-; X86-NEXT: testb %ch, %cl
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
; X86-NEXT: cmovnel %edi, %eax
; X86-NEXT: movswl %ax, %ecx
; X86-NEXT: cmpl $16383, %ecx # imm = 0x3FFF
@@ -214,6 +220,7 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%y2 = sext i8 %y to i15
%y3 = shl i15 %y2, 7
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index ca5558561a65b9..67b3c34c225620 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -58,8 +58,9 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: callq _return_false
; GENERIC-NEXT: xorl %ecx, %ecx
; GENERIC-NEXT: testb $1, %al
-; GENERIC-NEXT: movl $-3840, %eax ## imm = 0xF100
+; GENERIC-NEXT: movl $-480, %eax ## imm = 0xFE20
; GENERIC-NEXT: cmovnel %ecx, %eax
+; GENERIC-NEXT: shll $3, %eax
; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000
; GENERIC-NEXT: jge LBB1_1
; GENERIC-NEXT: ## %bb.2: ## %bb91
@@ -74,9 +75,10 @@ define i32 @test2() nounwind {
; ATOM-NEXT: pushq %rax
; ATOM-NEXT: callq _return_false
; ATOM-NEXT: xorl %ecx, %ecx
-; ATOM-NEXT: movl $-3840, %edx ## imm = 0xF100
+; ATOM-NEXT: movl $-480, %edx ## imm = 0xFE20
; ATOM-NEXT: testb $1, %al
; ATOM-NEXT: cmovnel %ecx, %edx
+; ATOM-NEXT: shll $3, %edx
; ATOM-NEXT: cmpl $32768, %edx ## imm = 0x8000
; ATOM-NEXT: jge LBB1_1
; ATOM-NEXT: ## %bb.2: ## %bb91
@@ -92,8 +94,9 @@ define i32 @test2() nounwind {
; ATHLON-NEXT: calll _return_false
; ATHLON-NEXT: xorl %ecx, %ecx
; ATHLON-NEXT: testb $1, %al
-; ATHLON-NEXT: movl $-3840, %eax ## imm = 0xF100
+; ATHLON-NEXT: movl $-480, %eax ## imm = 0xFE20
; ATHLON-NEXT: cmovnel %ecx, %eax
+; ATHLON-NEXT: shll $3, %eax
; ATHLON-NEXT: cmpl $32768, %eax ## imm = 0x8000
; ATHLON-NEXT: jge LBB1_1
; ATHLON-NEXT: ## %bb.2: ## %bb91
@@ -110,8 +113,9 @@ define i32 @test2() nounwind {
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB1_2
; MCU-NEXT: # %bb.1: # %entry
-; MCU-NEXT: movl $-3840, %ecx # imm = 0xF100
+; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20
; MCU-NEXT: .LBB1_2: # %entry
+; MCU-NEXT: shll $3, %ecx
; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000
; MCU-NEXT: jge .LBB1_3
; MCU-NEXT: # %bb.4: # %bb91
@@ -415,16 +419,16 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; ATHLON: ## %bb.0:
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax
; ATHLON-NEXT: notl %eax
-; ATHLON-NEXT: shrl $27, %eax
-; ATHLON-NEXT: andl $-16, %eax
+; ATHLON-NEXT: shrl $31, %eax
+; ATHLON-NEXT: shll $4, %eax
; ATHLON-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}}(%eax)
; ATHLON-NEXT: retl
;
; MCU-LABEL: test7:
; MCU: # %bb.0:
; MCU-NEXT: notl %eax
-; MCU-NEXT: shrl $27, %eax
-; MCU-NEXT: andl $-16, %eax
+; MCU-NEXT: shrl $31, %eax
+; MCU-NEXT: shll $4, %eax
; MCU-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}}(%eax)
; MCU-NEXT: retl
%tmp9 = icmp sgt i32 %tmp8, -1
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index d604923b48a11a..20d67b43f64d4d 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -570,7 +570,7 @@ define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorb $1, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: shll $6, %eax
+; X86-NEXT: shlw $6, %ax
; X86-NEXT: orl $7, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
@@ -579,7 +579,7 @@ define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
; X64: # %bb.0:
; X64-NEXT: xorb $1, %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shll $6, %eax
+; X64-NEXT: shlw $6, %ax
; X64-NEXT: orl $7, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
index 03f4c0f61cdd1f..ad45570d9f3b8e 100644
--- a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
@@ -22,8 +22,8 @@ define i32 @neg_sel_special_constant(i32 %a) {
; ANY-LABEL: neg_sel_special_constant:
; ANY: # %bb.0:
; ANY-NEXT: movl %edi, %eax
-; ANY-NEXT: shrl $22, %eax
-; ANY-NEXT: andl $512, %eax # imm = 0x200
+; ANY-NEXT: shrl $31, %eax
+; ANY-NEXT: shll $9, %eax
; ANY-NEXT: retq
%tmp.1 = icmp slt i32 %a, 0
%retval = select i1 %tmp.1, i32 512, i32 0
@@ -82,8 +82,8 @@ define i32 @pos_sel_special_constant(i32 %a) {
; ANY: # %bb.0:
; ANY-NEXT: movl %edi, %eax
; ANY-NEXT: notl %eax
-; ANY-NEXT: shrl $22, %eax
-; ANY-NEXT: andl $512, %eax # imm = 0x200
+; ANY-NEXT: shrl $31, %eax
+; ANY-NEXT: shll $9, %eax
; ANY-NEXT: retq
%tmp.1 = icmp sgt i32 %a, -1
%retval = select i1 %tmp.1, i32 512, i32 0
@@ -159,6 +159,7 @@ define i8 @sel_shift_bool_i8(i1 %t) {
; ANY-LABEL: sel_shift_bool_i8:
; ANY: # %bb.0:
; ANY-NEXT: movl %edi, %eax
+; ANY-NEXT: andb $1, %al
; ANY-NEXT: shlb $7, %al
; ANY-NEXT: # kill: def $al killed $al killed $eax
; ANY-NEXT: retq
@@ -171,7 +172,7 @@ define i16 @sel_shift_bool_i16(i1 %t) {
; ANY: # %bb.0:
; ANY-NEXT: movl %edi, %eax
; ANY-NEXT: andl $1, %eax
-; ANY-NEXT: shll $7, %eax
+; ANY-NEXT: shlw $7, %ax
; ANY-NEXT: # kill: def $ax killed $ax killed $eax
; ANY-NEXT: retq
%shl = select i1 %t, i16 128, i16 0
@@ -194,7 +195,7 @@ define i64 @sel_shift_bool_i64(i1 %t) {
; ANY: # %bb.0:
; ANY-NEXT: movl %edi, %eax
; ANY-NEXT: andl $1, %eax
-; ANY-NEXT: shll $16, %eax
+; ANY-NEXT: shlq $16, %rax
; ANY-NEXT: retq
%shl = select i1 %t, i64 65536, i64 0
ret i64 %shl
diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll
index 60ac6df3f77af2..66c225ea561535 100644
--- a/llvm/test/CodeGen/X86/setcc.ll
+++ b/llvm/test/CodeGen/X86/setcc.ll
@@ -64,7 +64,7 @@ define i64 @t3(i64 %x) nounwind readnone ssp {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq $18, %rdi
; X64-NEXT: setb %al
-; X64-NEXT: shll $6, %eax
+; X64-NEXT: shlq $6, %rax
; X64-NEXT: retq
%t0 = icmp ult i64 %x, 18
%if = select i1 %t0, i64 64, i64 0
@@ -78,18 +78,24 @@ define i32 @t4(i32 %a) {
; X86: ## %bb.0:
; X86-NEXT: movl L_v4$non_lazy_ptr, %ecx
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl $1, (%ecx)
-; X86-NEXT: adcw $1, %ax
+; X86-NEXT: cmpl $0, (%ecx)
+; X86-NEXT: sete %al
+; X86-NEXT: shll $15, %eax
+; X86-NEXT: shrw $15, %ax
; X86-NEXT: shll $16, %eax
+; X86-NEXT: addl $65536, %eax ## imm = 0x10000
; X86-NEXT: retl
;
; X64-LABEL: t4:
; X64: ## %bb.0:
; X64-NEXT: movq _v4 at GOTPCREL(%rip), %rcx
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpl $1, (%rcx)
-; X64-NEXT: adcw $1, %ax
+; X64-NEXT: cmpl $0, (%rcx)
+; X64-NEXT: sete %al
+; X64-NEXT: shll $15, %eax
+; X64-NEXT: shrw $15, %ax
; X64-NEXT: shll $16, %eax
+; X64-NEXT: addl $65536, %eax ## imm = 0x10000
; X64-NEXT: retq
%t0 = load i32, ptr @v4, align 4
%not.tobool = icmp eq i32 %t0, 0
@@ -287,19 +293,13 @@ define i16 @shift_and(i16 %a) {
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
-; X64-NOTBM-LABEL: shift_and:
-; X64-NOTBM: ## %bb.0:
-; X64-NOTBM-NEXT: movl %edi, %eax
-; X64-NOTBM-NEXT: shrl $10, %eax
-; X64-NOTBM-NEXT: andl $1, %eax
-; X64-NOTBM-NEXT: ## kill: def $ax killed $ax killed $eax
-; X64-NOTBM-NEXT: retq
-;
-; X64-TBM-LABEL: shift_and:
-; X64-TBM: ## %bb.0:
-; X64-TBM-NEXT: bextrl $266, %edi, %eax ## imm = 0x10A
-; X64-TBM-NEXT: ## kill: def $ax killed $ax killed $eax
-; X64-TBM-NEXT: retq
+; X64-LABEL: shift_and:
+; X64: ## %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $1024, %eax ## imm = 0x400
+; X64-NEXT: shrw $10, %ax
+; X64-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
%and = and i16 %a, 1024
%cmp = icmp ne i16 %and, 0
%conv = zext i1 %cmp to i16
@@ -354,3 +354,6 @@ define i64 @pr63055(double %arg) {
%ext = zext i1 %fcmp to i64
ret i64 %ext
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-NOTBM: {{.*}}
+; X64-TBM: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sext-subreg.ll b/llvm/test/CodeGen/X86/sext-subreg.ll
index 20451ff208cc05..ae530d92adc2c4 100644
--- a/llvm/test/CodeGen/X86/sext-subreg.ll
+++ b/llvm/test/CodeGen/X86/sext-subreg.ll
@@ -7,9 +7,11 @@ define i64 @t(i64 %A, i64 %B, ptr %P, ptr%P2) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addq %rsi, %rdi
; CHECK-NEXT: movl %edi, (%rdx)
-; CHECK-NEXT: movslq %edi, %rax
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shlq $32, %rax
+; CHECK-NEXT: sarq $32, %rax
; CHECK-NEXT: movq %rax, (%rcx)
-; CHECK-NEXT: movl %eax, (%rdx)
+; CHECK-NEXT: movl %edi, (%rdx)
; CHECK-NEXT: retq
%C = add i64 %A, %B
%D = trunc i64 %C to i32
diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll
index e61ba4923f7928..b45697e087e942 100644
--- a/llvm/test/CodeGen/X86/shift-and.ll
+++ b/llvm/test/CodeGen/X86/shift-and.ll
@@ -209,9 +209,9 @@ define i64 @big_mask_constant(i64 %x) nounwind {
;
; X64-LABEL: big_mask_constant:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $17179869184, %rax # imm = 0x400000000
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: shrq $7, %rax
-; X64-NEXT: andl $134217728, %eax # imm = 0x8000000
; X64-NEXT: retq
%and = and i64 %x, 17179869184 ; 0x400000000
%sh = lshr i64 %and, 7
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 30c3d53dd37c9e..e1c096c5781254 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -8,8 +8,9 @@ define dso_local i32 @test_lshr_and(i32 %x) {
; X86-LABEL: test_lshr_and:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $12, %eax
-; X86-NEXT: movl array(%eax), %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $3, %eax
+; X86-NEXT: movl array(,%eax,4), %eax
; X86-NEXT: retl
;
; X64-LABEL: test_lshr_and:
@@ -30,7 +31,8 @@ define dso_local ptr @test_exact1(i32 %a, i32 %b, ptr %x) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl %eax
+; X86-NEXT: sarl $3, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -52,7 +54,8 @@ define dso_local ptr @test_exact2(i32 %a, i32 %b, ptr %x) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl %eax
+; X86-NEXT: sarl $3, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -74,6 +77,8 @@ define dso_local ptr @test_exact3(i32 %a, i32 %b, ptr %x) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -95,7 +100,8 @@ define dso_local ptr @test_exact4(i32 %a, i32 %b, ptr %x) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -117,7 +123,8 @@ define dso_local ptr @test_exact5(i32 %a, i32 %b, ptr %x) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -139,6 +146,8 @@ define dso_local ptr @test_exact6(i32 %a, i32 %b, ptr %x) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: shll $2, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -167,8 +176,10 @@ define i64 @ashr_add_shl_i32(i64 %r) nounwind {
;
; X64-LABEL: ashr_add_shl_i32:
; X64: # %bb.0:
-; X64-NEXT: incl %edi
-; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: shlq $32, %rdi
+; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: sarq $32, %rax
; X64-NEXT: retq
%conv = shl i64 %r, 32
%sext = add i64 %conv, 4294967296
@@ -179,17 +190,20 @@ define i64 @ashr_add_shl_i32(i64 %r) nounwind {
define i64 @ashr_add_shl_i8(i64 %r) nounwind {
; X86-LABEL: ashr_add_shl_i8:
; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addb $2, %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shll $24, %edx
+; X86-NEXT: addl $33554432, %edx # imm = 0x2000000
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: sarl $24, %eax
; X86-NEXT: sarl $31, %edx
; X86-NEXT: retl
;
; X64-LABEL: ashr_add_shl_i8:
; X64: # %bb.0:
-; X64-NEXT: addb $2, %dil
-; X64-NEXT: movsbq %dil, %rax
+; X64-NEXT: shlq $56, %rdi
+; X64-NEXT: movabsq $144115188075855872, %rax # imm = 0x200000000000000
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: sarq $56, %rax
; X64-NEXT: retq
%conv = shl i64 %r, 56
%sext = add i64 %conv, 144115188075855872
@@ -203,22 +217,26 @@ define <4 x i32> @ashr_add_shl_v4i8(<4 x i32> %r) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT: incb %dh
-; X86-NEXT: movsbl %dh, %esi
-; X86-NEXT: incb %ch
-; X86-NEXT: movsbl %ch, %edi
-; X86-NEXT: incb %dl
-; X86-NEXT: movsbl %dl, %edx
-; X86-NEXT: incb %cl
-; X86-NEXT: movsbl %cl, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: shll $24, %edi
+; X86-NEXT: shll $24, %esi
+; X86-NEXT: shll $24, %edx
+; X86-NEXT: shll $24, %ecx
+; X86-NEXT: addl $16777216, %ecx # imm = 0x1000000
+; X86-NEXT: addl $16777216, %edx # imm = 0x1000000
+; X86-NEXT: addl $16777216, %esi # imm = 0x1000000
+; X86-NEXT: addl $16777216, %edi # imm = 0x1000000
+; X86-NEXT: sarl $24, %edi
+; X86-NEXT: sarl $24, %esi
+; X86-NEXT: sarl $24, %edx
+; X86-NEXT: sarl $24, %ecx
; X86-NEXT: movl %ecx, 12(%eax)
; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl $4
@@ -430,9 +448,10 @@ define i64 @ashr_add_neg_shl_i32(i64 %r) nounwind {
;
; X64-LABEL: ashr_add_neg_shl_i32:
; X64: # %bb.0:
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: subl %edi, %eax
-; X64-NEXT: cltq
+; X64-NEXT: shlq $32, %rdi
+; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: sarq $32, %rax
; X64-NEXT: retq
%conv = mul i64 %r, -4294967296
%sext = add i64 %conv, 4294967296
@@ -455,9 +474,10 @@ define i64 @ashr_add_neg_shl_i8(i64 %r) nounwind {
;
; X64-LABEL: ashr_add_neg_shl_i8:
; X64: # %bb.0:
-; X64-NEXT: movb $2, %al
-; X64-NEXT: subb %dil, %al
-; X64-NEXT: movsbq %al, %rax
+; X64-NEXT: shlq $56, %rdi
+; X64-NEXT: movabsq $144115188075855872, %rax # imm = 0x200000000000000
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: sarq $56, %rax
; X64-NEXT: retq
%conv = mul i64 %r, -72057594037927936
%sext = add i64 %conv, 144115188075855872
@@ -469,27 +489,39 @@ define i64 @ashr_add_neg_shl_i8(i64 %r) nounwind {
define <4 x i32> @ashr_add_neg_shl_v4i8(<4 x i32> %r) nounwind {
; X86-LABEL: ashr_add_neg_shl_v4i8:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movb $1, %cl
-; X86-NEXT: movb $1, %dl
-; X86-NEXT: subb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: movsbl %dl, %edx
-; X86-NEXT: movb $1, %ch
-; X86-NEXT: subb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: movsbl %ch, %esi
-; X86-NEXT: movb $1, %ch
-; X86-NEXT: subb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: movsbl %ch, %edi
-; X86-NEXT: subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: movsbl %cl, %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shll $24, %edx
+; X86-NEXT: shll $24, %esi
+; X86-NEXT: shll $24, %ebx
+; X86-NEXT: shll $24, %ebp
+; X86-NEXT: movl $16777216, %ecx # imm = 0x1000000
+; X86-NEXT: movl $16777216, %edi # imm = 0x1000000
+; X86-NEXT: subl %ebp, %edi
+; X86-NEXT: movl $16777216, %ebp # imm = 0x1000000
+; X86-NEXT: subl %ebx, %ebp
+; X86-NEXT: movl $16777216, %ebx # imm = 0x1000000
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: sarl $24, %ecx
+; X86-NEXT: sarl $24, %ebx
+; X86-NEXT: sarl $24, %ebp
+; X86-NEXT: sarl $24, %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %ebp, 8(%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
; X64-LABEL: ashr_add_neg_shl_v4i8:
diff --git a/llvm/test/CodeGen/X86/shift-folding.ll b/llvm/test/CodeGen/X86/shift-folding.ll
index c4be7d990cbaa2..5802e36e71239f 100644
--- a/llvm/test/CodeGen/X86/shift-folding.ll
+++ b/llvm/test/CodeGen/X86/shift-folding.ll
@@ -5,7 +5,8 @@ define ptr @test1(ptr %P, i32 %X) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: shll $2, %eax
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: retl
%Y = lshr i32 %X, 2
@@ -31,7 +32,8 @@ define ptr @test3(ptr %P, i32 %X) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: shll $2, %eax
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: retl
%Y = ashr i32 %X, 2
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2f..577c2ae305022b 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -917,33 +917,50 @@ define <2 x i256> @shl_zext_lshr_outofrange(<2 x i128> %a0) {
define i128 @lshr_shl_mask(i128 %a0) {
; i686-LABEL: lshr_shl_mask:
; i686: # %bb.0:
-; i686-NEXT: pushl %edi
+; i686-NEXT: pushl %ebx
; i686-NEXT: .cfi_def_cfa_offset 8
-; i686-NEXT: pushl %esi
+; i686-NEXT: pushl %edi
; i686-NEXT: .cfi_def_cfa_offset 12
-; i686-NEXT: .cfi_offset %esi, -12
-; i686-NEXT: .cfi_offset %edi, -8
+; i686-NEXT: pushl %esi
+; i686-NEXT: .cfi_def_cfa_offset 16
+; i686-NEXT: .cfi_offset %esi, -16
+; i686-NEXT: .cfi_offset %edi, -12
+; i686-NEXT: .cfi_offset %ebx, -8
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF
-; i686-NEXT: andl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: addl %edi, %edi
+; i686-NEXT: leal (%esi,%esi), %ebx
+; i686-NEXT: shrl $31, %esi
+; i686-NEXT: shldl $31, %ebx, %esi
+; i686-NEXT: leal (%edx,%edx), %ebx
+; i686-NEXT: shrl $31, %edx
+; i686-NEXT: shldl $31, %ebx, %edx
+; i686-NEXT: leal (%ecx,%ecx), %ebx
+; i686-NEXT: shrl $31, %ecx
+; i686-NEXT: shldl $31, %ebx, %ecx
+; i686-NEXT: shrl %edi
; i686-NEXT: movl %edi, 12(%eax)
; i686-NEXT: movl %esi, 8(%eax)
; i686-NEXT: movl %edx, 4(%eax)
; i686-NEXT: movl %ecx, (%eax)
; i686-NEXT: popl %esi
-; i686-NEXT: .cfi_def_cfa_offset 8
+; i686-NEXT: .cfi_def_cfa_offset 12
; i686-NEXT: popl %edi
+; i686-NEXT: .cfi_def_cfa_offset 8
+; i686-NEXT: popl %ebx
; i686-NEXT: .cfi_def_cfa_offset 4
; i686-NEXT: retl $4
;
; x86_64-LABEL: lshr_shl_mask:
; x86_64: # %bb.0:
-; x86_64-NEXT: movq %rdi, %rax
-; x86_64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; x86_64-NEXT: andq %rsi, %rdx
+; x86_64-NEXT: leaq (%rsi,%rsi), %rdx
+; x86_64-NEXT: leaq (%rdi,%rdi), %rax
+; x86_64-NEXT: shrq $63, %rdi
+; x86_64-NEXT: shrdq $1, %rdi, %rax
+; x86_64-NEXT: shrq %rdx
; x86_64-NEXT: retq
%1 = shl i128 %a0, 1
%2 = lshr i128 %1, 1
diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll
index 604f5c19b92e5d..ffaf64df20795e 100644
--- a/llvm/test/CodeGen/X86/shift-mask.ll
+++ b/llvm/test/CodeGen/X86/shift-mask.ll
@@ -21,13 +21,15 @@ define i8 @test_i8_shl_lshr_0(i8 %a0) {
; X86-LABEL: test_i8_shl_lshr_0:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andb $-8, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: shlb $3, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_shl_lshr_0:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $-8, %al
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shrb $3, %dil
+; X64-NEXT: leal (,%rdi,8), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%1 = lshr i8 %a0, 3
@@ -67,25 +69,17 @@ define i8 @test_i8_shl_lshr_2(i8 %a0) {
; X86-LABEL: test_i8_shl_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: andb $56, %al
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: shlb $3, %al
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i8_shl_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: movl %edi, %eax
-; X64-MASK-NEXT: shrb $2, %al
-; X64-MASK-NEXT: andb $56, %al
-; X64-MASK-NEXT: # kill: def $al killed $al killed $eax
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i8_shl_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-SHIFT-NEXT: shrb $5, %dil
-; X64-SHIFT-NEXT: leal (,%rdi,8), %eax
-; X64-SHIFT-NEXT: # kill: def $al killed $al killed $eax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i8_shl_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shrb $5, %dil
+; X64-NEXT: leal (,%rdi,8), %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
%1 = lshr i8 %a0, 5
%2 = shl i8 %1, 3
ret i8 %2
@@ -95,14 +89,16 @@ define i16 @test_i16_shl_lshr_0(i16 %a0) {
; X86-LABEL: test_i16_shl_lshr_0:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-8, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: shlw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_shl_lshr_0:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $65528, %eax # imm = 0xFFF8
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: shlw $3, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%1 = lshr i16 %a0, 3
@@ -143,26 +139,18 @@ define i16 @test_i16_shl_lshr_2(i16 %a0) {
; X86-LABEL: test_i16_shl_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: andl $-8, %eax
+; X86-NEXT: shrl $5, %eax
+; X86-NEXT: shlw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i16_shl_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: movl %edi, %eax
-; X64-MASK-NEXT: shrl $2, %eax
-; X64-MASK-NEXT: andl $16376, %eax # imm = 0x3FF8
-; X64-MASK-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i16_shl_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: movzwl %di, %eax
-; X64-SHIFT-NEXT: shrl $5, %eax
-; X64-SHIFT-NEXT: shll $3, %eax
-; X64-SHIFT-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i16_shl_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: shrl $5, %eax
+; X64-NEXT: shlw $3, %ax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
%1 = lshr i16 %a0, 5
%2 = shl i16 %1, 3
ret i16 %2
@@ -172,13 +160,15 @@ define i32 @test_i32_shl_lshr_0(i32 %a0) {
; X86-LABEL: test_i32_shl_lshr_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $-8, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i32_shl_lshr_0:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $-8, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shrl $3, %edi
+; X64-NEXT: leal (,%rdi,8), %eax
; X64-NEXT: retq
%1 = lshr i32 %a0, 3
%2 = shl i32 %1, 3
@@ -215,23 +205,16 @@ define i32 @test_i32_shl_lshr_2(i32 %a0) {
; X86-LABEL: test_i32_shl_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: andl $-8, %eax
+; X86-NEXT: shrl $5, %eax
+; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i32_shl_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: movl %edi, %eax
-; X64-MASK-NEXT: shrl $2, %eax
-; X64-MASK-NEXT: andl $-8, %eax
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i32_shl_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-SHIFT-NEXT: shrl $5, %edi
-; X64-SHIFT-NEXT: leal (,%rdi,8), %eax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i32_shl_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shrl $5, %edi
+; X64-NEXT: leal (,%rdi,8), %eax
+; X64-NEXT: retq
%1 = lshr i32 %a0, 5
%2 = shl i32 %1, 3
ret i32 %2
@@ -242,13 +225,18 @@ define i64 @test_i64_shl_lshr_0(i64 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl $-8, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: shll $29, %ecx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: shldl $3, %ecx, %edx
+; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_i64_shl_lshr_0:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: andq $-8, %rax
+; X64-NEXT: shrq $3, %rdi
+; X64-NEXT: leaq (,%rdi,8), %rax
; X64-NEXT: retq
%1 = lshr i64 %a0, 3
%2 = shl i64 %1, 3
@@ -287,23 +275,19 @@ define i64 @test_i64_shl_lshr_2(i64 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: shrdl $2, %edx, %eax
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: andl $-8, %eax
+; X86-NEXT: shrdl $5, %edx, %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: shll $27, %ecx
+; X86-NEXT: shrl $5, %edx
+; X86-NEXT: shldl $3, %ecx, %edx
+; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i64_shl_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: movq %rdi, %rax
-; X64-MASK-NEXT: shrq $2, %rax
-; X64-MASK-NEXT: andq $-8, %rax
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i64_shl_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: shrq $5, %rdi
-; X64-SHIFT-NEXT: leaq (,%rdi,8), %rax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i64_shl_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: shrq $5, %rdi
+; X64-NEXT: leaq (,%rdi,8), %rax
+; X64-NEXT: retq
%1 = lshr i64 %a0, 5
%2 = shl i64 %1, 3
ret i64 %2
@@ -319,13 +303,15 @@ define i8 @test_i8_lshr_lshr_0(i8 %a0) {
; X86-LABEL: test_i8_lshr_lshr_0:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andb $31, %al
+; X86-NEXT: shlb $3, %al
+; X86-NEXT: shrb $3, %al
; X86-NEXT: retl
;
; X64-LABEL: test_i8_lshr_lshr_0:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $31, %al
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (,%rdi,8), %eax
+; X64-NEXT: shrb $3, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%1 = shl i8 %a0, 3
@@ -365,25 +351,17 @@ define i8 @test_i8_lshr_lshr_2(i8 %a0) {
; X86-LABEL: test_i8_lshr_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shlb $2, %al
-; X86-NEXT: andb $28, %al
+; X86-NEXT: shlb $5, %al
+; X86-NEXT: shrb $3, %al
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i8_lshr_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-MASK-NEXT: leal (,%rdi,4), %eax
-; X64-MASK-NEXT: andb $28, %al
-; X64-MASK-NEXT: # kill: def $al killed $al killed $eax
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i8_lshr_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: movl %edi, %eax
-; X64-SHIFT-NEXT: shlb $5, %al
-; X64-SHIFT-NEXT: shrb $3, %al
-; X64-SHIFT-NEXT: # kill: def $al killed $al killed $eax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i8_lshr_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shlb $5, %al
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
%1 = shl i8 %a0, 5
%2 = lshr i8 %1, 3
ret i8 %2
@@ -392,15 +370,17 @@ define i8 @test_i8_lshr_lshr_2(i8 %a0) {
define i16 @test_i16_lshr_lshr_0(i16 %a0) {
; X86-LABEL: test_i16_lshr_lshr_0:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $8191, %eax # imm = 0x1FFF
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $3, %eax
+; X86-NEXT: shrw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_lshr_lshr_0:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $8191, %eax # imm = 0x1FFF
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (,%rdi,8), %eax
+; X64-NEXT: shrw $3, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%1 = shl i16 %a0, 3
@@ -454,16 +434,16 @@ define i16 @test_i16_lshr_lshr_2(i16 %a0) {
; X86-LABEL: test_i16_lshr_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $2, %eax
-; X86-NEXT: andl $8188, %eax # imm = 0x1FFC
+; X86-NEXT: shll $5, %eax
+; X86-NEXT: shrw $3, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test_i16_lshr_lshr_2:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: leal (,%rdi,4), %eax
-; X64-NEXT: andl $8188, %eax # imm = 0x1FFC
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shll $5, %eax
+; X64-NEXT: shrw $3, %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%1 = shl i16 %a0, 5
@@ -474,15 +454,39 @@ define i16 @test_i16_lshr_lshr_2(i16 %a0) {
define i32 @test_i32_lshr_lshr_0(i32 %a0) {
; X86-LABEL: test_i32_lshr_lshr_0:
; X86: # %bb.0:
-; X86-NEXT: movl $536870911, %eax # imm = 0x1FFFFFFF
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $3, %eax
+; X86-NEXT: shrl $3, %eax
; X86-NEXT: retl
;
-; X64-LABEL: test_i32_lshr_lshr_0:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF
-; X64-NEXT: retq
+; X64-MASK-LABEL: test_i32_lshr_lshr_0:
+; X64-MASK: # %bb.0:
+; X64-MASK-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-MASK-NEXT: leal (,%rdi,8), %eax
+; X64-MASK-NEXT: shrl $3, %eax
+; X64-MASK-NEXT: retq
+;
+; X64-SHIFT2-LABEL: test_i32_lshr_lshr_0:
+; X64-SHIFT2: # %bb.0:
+; X64-SHIFT2-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SHIFT2-NEXT: leal (,%rdi,8), %eax
+; X64-SHIFT2-NEXT: shrl $3, %eax
+; X64-SHIFT2-NEXT: retq
+;
+; X64-BMI1-LABEL: test_i32_lshr_lshr_0:
+; X64-BMI1: # %bb.0:
+; X64-BMI1-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-BMI1-NEXT: leal (,%rdi,8), %eax
+; X64-BMI1-NEXT: shrl $3, %eax
+; X64-BMI1-NEXT: retq
+;
+; X64-BMI2-LABEL: test_i32_lshr_lshr_0:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movb $3, %al
+; X64-BMI2-NEXT: movl $32, %ecx
+; X64-BMI2-NEXT: subl %eax, %ecx
+; X64-BMI2-NEXT: bzhil %ecx, %edi, %eax
+; X64-BMI2-NEXT: retq
%1 = shl i32 %a0, 3
%2 = lshr i32 %1, 3
ret i32 %2
@@ -518,23 +522,16 @@ define i32 @test_i32_lshr_lshr_2(i32 %a0) {
; X86-LABEL: test_i32_lshr_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $2, %eax
-; X86-NEXT: andl $536870908, %eax # imm = 0x1FFFFFFC
+; X86-NEXT: shll $5, %eax
+; X86-NEXT: shrl $3, %eax
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i32_lshr_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-MASK-NEXT: leal (,%rdi,4), %eax
-; X64-MASK-NEXT: andl $536870908, %eax # imm = 0x1FFFFFFC
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i32_lshr_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: movl %edi, %eax
-; X64-SHIFT-NEXT: shll $5, %eax
-; X64-SHIFT-NEXT: shrl $3, %eax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i32_lshr_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shll $5, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: retq
%1 = shl i32 %a0, 5
%2 = lshr i32 %1, 3
ret i32 %2
@@ -544,37 +541,38 @@ define i64 @test_i64_lshr_lshr_0(i64 %a0) {
; X86-LABEL: test_i64_lshr_lshr_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl $536870911, %edx # imm = 0x1FFFFFFF
-; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shll $3, %edx
+; X86-NEXT: leal (,%eax,8), %ecx
+; X86-NEXT: shrl $29, %eax
+; X86-NEXT: shldl $29, %ecx, %eax
+; X86-NEXT: shrl $3, %edx
; X86-NEXT: retl
;
; X64-MASK-LABEL: test_i64_lshr_lshr_0:
; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF
-; X64-MASK-NEXT: andq %rdi, %rax
+; X64-MASK-NEXT: leaq (,%rdi,8), %rax
+; X64-MASK-NEXT: shrq $3, %rax
; X64-MASK-NEXT: retq
;
; X64-SHIFT2-LABEL: test_i64_lshr_lshr_0:
; X64-SHIFT2: # %bb.0:
-; X64-SHIFT2-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF
-; X64-SHIFT2-NEXT: andq %rdi, %rax
+; X64-SHIFT2-NEXT: leaq (,%rdi,8), %rax
+; X64-SHIFT2-NEXT: shrq $3, %rax
; X64-SHIFT2-NEXT: retq
;
-; X64-TBM-LABEL: test_i64_lshr_lshr_0:
-; X64-TBM: # %bb.0:
-; X64-TBM-NEXT: bextrq $15616, %rdi, %rax # imm = 0x3D00
-; X64-TBM-NEXT: retq
-;
; X64-BMI1-LABEL: test_i64_lshr_lshr_0:
; X64-BMI1: # %bb.0:
-; X64-BMI1-NEXT: movl $15616, %eax # imm = 0x3D00
-; X64-BMI1-NEXT: bextrq %rax, %rdi, %rax
+; X64-BMI1-NEXT: leaq (,%rdi,8), %rax
+; X64-BMI1-NEXT: shrq $3, %rax
; X64-BMI1-NEXT: retq
;
; X64-BMI2-LABEL: test_i64_lshr_lshr_0:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movb $61, %al
-; X64-BMI2-NEXT: bzhiq %rax, %rdi, %rax
+; X64-BMI2-NEXT: movb $3, %al
+; X64-BMI2-NEXT: movl $64, %ecx
+; X64-BMI2-NEXT: subl %eax, %ecx
+; X64-BMI2-NEXT: bzhiq %rcx, %rdi, %rax
; X64-BMI2-NEXT: retq
%1 = shl i64 %a0, 3
%2 = lshr i64 %1, 3
@@ -613,24 +611,20 @@ define i64 @test_i64_lshr_lshr_2(i64 %a0) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: shldl $2, %eax, %edx
-; X86-NEXT: shll $2, %eax
-; X86-NEXT: andl $536870911, %edx # imm = 0x1FFFFFFF
+; X86-NEXT: shldl $5, %eax, %edx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $5, %ecx
+; X86-NEXT: shrl $27, %eax
+; X86-NEXT: shldl $29, %ecx, %eax
+; X86-NEXT: shrl $3, %edx
; X86-NEXT: retl
;
-; X64-MASK-LABEL: test_i64_lshr_lshr_2:
-; X64-MASK: # %bb.0:
-; X64-MASK-NEXT: leaq (,%rdi,4), %rcx
-; X64-MASK-NEXT: movabsq $2305843009213693948, %rax # imm = 0x1FFFFFFFFFFFFFFC
-; X64-MASK-NEXT: andq %rcx, %rax
-; X64-MASK-NEXT: retq
-;
-; X64-SHIFT-LABEL: test_i64_lshr_lshr_2:
-; X64-SHIFT: # %bb.0:
-; X64-SHIFT-NEXT: movq %rdi, %rax
-; X64-SHIFT-NEXT: shlq $5, %rax
-; X64-SHIFT-NEXT: shrq $3, %rax
-; X64-SHIFT-NEXT: retq
+; X64-LABEL: test_i64_lshr_lshr_2:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shlq $5, %rax
+; X64-NEXT: shrq $3, %rax
+; X64-NEXT: retq
%1 = shl i64 %a0, 5
%2 = lshr i64 %1, 3
ret i64 %2
diff --git a/llvm/test/CodeGen/X86/shift-pair.ll b/llvm/test/CodeGen/X86/shift-pair.ll
index d809f9fcbfcc63..912b02f11de282 100644
--- a/llvm/test/CodeGen/X86/shift-pair.ll
+++ b/llvm/test/CodeGen/X86/shift-pair.ll
@@ -4,9 +4,8 @@
define i64 @test(i64 %A) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq $54, %rax
-; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: shrq $56, %rdi
+; CHECK-NEXT: leaq (,%rdi,4), %rax
; CHECK-NEXT: retq
%B = lshr i64 %A, 56
%C = shl i64 %B, 2
diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
index 254b8fe3fc6e30..be164bb680dd25 100644
--- a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
+++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
@@ -102,6 +102,8 @@ define dso_local void @test5(i32 %X) nounwind !prof !14 {
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movzwl x+4(%rip), %ecx
; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: shrq $32, %rcx
; CHECK-NEXT: cmpl $1, %ecx
; CHECK-NEXT: jne bar # TAILCALL
; CHECK-NEXT: # %bb.1: # %if.end
diff --git a/llvm/test/CodeGen/X86/shrink-compare.ll b/llvm/test/CodeGen/X86/shrink-compare.ll
index 840167ff9f4a0c..4819688c392795 100644
--- a/llvm/test/CodeGen/X86/shrink-compare.ll
+++ b/llvm/test/CodeGen/X86/shrink-compare.ll
@@ -102,6 +102,8 @@ define dso_local void @test5(i32 %X) nounwind minsize {
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movzwl x+4(%rip), %ecx
; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: shrq $32, %rcx
; CHECK-NEXT: cmpl $1, %ecx
; CHECK-NEXT: jne bar # TAILCALL
; CHECK-NEXT: # %bb.1: # %if.end
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 85c966c447fad6..0d68b64510f88d 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -460,6 +460,8 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; X64-NEXT: shlb $4, %sil
; X64-NEXT: sarb $4, %sil
; X64-NEXT: shlb $4, %al
+; X64-NEXT: sarb $4, %al
+; X64-NEXT: shlb $4, %al
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: movl %eax, %edx
; X64-NEXT: xorb %sil, %dl
@@ -480,6 +482,8 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; X86-NEXT: sarb $4, %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shlb $4, %al
+; X86-NEXT: sarb $4, %al
+; X86-NEXT: shlb $4, %al
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: movb %al, %ah
; X86-NEXT: xorb %cl, %ah
diff --git a/llvm/test/CodeGen/X86/split-store.ll b/llvm/test/CodeGen/X86/split-store.ll
index 416c0cbeeddbd1..821571489a7f48 100644
--- a/llvm/test/CodeGen/X86/split-store.ll
+++ b/llvm/test/CodeGen/X86/split-store.ll
@@ -197,7 +197,7 @@ define void @int12_int12_pair(i12 signext %tmp1, i12 signext %tmp2, ptr %ref.tmp
define void @int7_int7_pair(i7 signext %tmp1, i7 signext %tmp2, ptr %ref.tmp) {
; CHECK-LABEL: int7_int7_pair:
; CHECK: # %bb.0:
-; CHECK-NEXT: shll $7, %esi
+; CHECK-NEXT: shlw $7, %si
; CHECK-NEXT: andl $127, %edi
; CHECK-NEXT: orl %esi, %edi
; CHECK-NEXT: andl $16383, %edi # imm = 0x3FFF
diff --git a/llvm/test/CodeGen/X86/srem-lkk.ll b/llvm/test/CodeGen/X86/srem-lkk.ll
index ae30ae4463a93f..2d95ee05a8ce46 100644
--- a/llvm/test/CodeGen/X86/srem-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-lkk.ll
@@ -106,7 +106,8 @@ define i32 @dont_fold_srem_power_of_two(i32 %x) {
; CHECK-NEXT: leal 63(%rax), %ecx
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %ecx
-; CHECK-NEXT: andl $-64, %ecx
+; CHECK-NEXT: sarl $6, %ecx
+; CHECK-NEXT: shll $6, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
@@ -132,7 +133,8 @@ define i32 @dont_fold_srem_i32_smax(i32 %x) {
; CHECK-NEXT: leal 2147483647(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
-; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: shll $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: retq
%1 = srem i32 %x, 2147483648
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index d644ed87c3c108..5b145b4b5a24eb 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -83,10 +83,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shlb $2, %cl
; X86-NEXT: sarb $5, %cl
+; X86-NEXT: andb $48, %cl
; X86-NEXT: shrb $4, %cl
-; X86-NEXT: andb $3, %cl
; X86-NEXT: addb %al, %cl
-; X86-NEXT: andb $60, %cl
+; X86-NEXT: shrb $2, %cl
+; X86-NEXT: shlb $2, %cl
; X86-NEXT: subb %cl, %al
; X86-NEXT: testb $63, %al
; X86-NEXT: setne %al
@@ -97,10 +98,11 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (,%rdi,4), %eax
; X64-NEXT: sarb $5, %al
+; X64-NEXT: andb $48, %al
; X64-NEXT: shrb $4, %al
-; X64-NEXT: andb $3, %al
; X64-NEXT: addb %dil, %al
-; X64-NEXT: andb $60, %al
+; X64-NEXT: shrb $2, %al
+; X64-NEXT: shlb $2, %al
; X64-NEXT: subb %al, %dil
; X64-NEXT: testb $63, %dil
; X64-NEXT: setne %al
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index d2a1e5e4281299..31b54560d849ec 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -554,49 +554,26 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
; CHECK-SSE-NEXT: psrad $31, %xmm1
; CHECK-SSE-NEXT: psrld $28, %xmm1
; CHECK-SSE-NEXT: paddd %xmm0, %xmm1
-; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: psrld $4, %xmm1
+; CHECK-SSE-NEXT: pslld $4, %xmm1
; CHECK-SSE-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE-NEXT: psrld $31, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_srem_pow2:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_srem_pow2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $28, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967280,4294967280,4294967280,4294967280]
-; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_srem_pow2:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpsrad $31, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpsrld $28, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_srem_pow2:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; CHECK-AVX-NEXT: vpsrld $28, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; CHECK-AVX-NEXT: vpsrld $4, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpslld $4, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
%srem = srem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
@@ -611,49 +588,26 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-SSE-NEXT: psrad $31, %xmm1
; CHECK-SSE-NEXT: psrld $1, %xmm1
; CHECK-SSE-NEXT: paddd %xmm0, %xmm1
-; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: psrld $31, %xmm1
+; CHECK-SSE-NEXT: pslld $31, %xmm1
; CHECK-SSE-NEXT: paddd %xmm1, %xmm0
; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE-NEXT: psrld $31, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_srem_int_min:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_srem_int_min:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_srem_int_min:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpsrad $31, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_srem_int_min:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; CHECK-AVX-NEXT: vpsrld $1, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; CHECK-AVX-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
%srem = srem <4 x i32> %X, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/srem-seteq.ll b/llvm/test/CodeGen/X86/srem-seteq.ll
index dfa1472b62fe55..9aaadcb1190355 100644
--- a/llvm/test/CodeGen/X86/srem-seteq.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq.ll
@@ -321,7 +321,8 @@ define i32 @test_srem_pow2(i32 %X) nounwind {
; X86-NEXT: leal 15(%ecx), %edx
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: cmovnsl %ecx, %edx
-; X86-NEXT: andl $-16, %edx
+; X86-NEXT: sarl $4, %edx
+; X86-NEXT: shll $4, %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: sete %al
@@ -333,7 +334,8 @@ define i32 @test_srem_pow2(i32 %X) nounwind {
; X64-NEXT: leal 15(%rdi), %ecx
; X64-NEXT: testl %edi, %edi
; X64-NEXT: cmovnsl %edi, %ecx
-; X64-NEXT: andl $-16, %ecx
+; X64-NEXT: sarl $4, %ecx
+; X64-NEXT: shll $4, %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl %ecx, %edi
; X64-NEXT: sete %al
@@ -352,7 +354,8 @@ define i32 @test_srem_int_min(i32 %X) nounwind {
; X86-NEXT: leal 2147483647(%ecx), %edx
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: cmovnsl %ecx, %edx
-; X86-NEXT: andl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT: shrl $31, %edx
+; X86-NEXT: shll $31, %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: sete %al
@@ -364,7 +367,8 @@ define i32 @test_srem_int_min(i32 %X) nounwind {
; X64-NEXT: leal 2147483647(%rdi), %ecx
; X64-NEXT: testl %edi, %edi
; X64-NEXT: cmovnsl %edi, %ecx
-; X64-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; X64-NEXT: shrl $31, %ecx
+; X64-NEXT: shll $31, %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: addl %edi, %ecx
; X64-NEXT: sete %al
diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
index c8de34f63dd85d..90d8bcea9a118b 100644
--- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -175,29 +175,33 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_srem_power_of_two:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: leal 31(%rax), %ecx
; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-32, %ecx
+; SSE-NEXT: movzwl %cx, %ecx
+; SSE-NEXT: shrl $5, %ecx
+; SSE-NEXT: shlw $5, %cx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: leal 63(%rcx), %edx
; SSE-NEXT: testw %cx, %cx
; SSE-NEXT: cmovnsl %ecx, %edx
-; SSE-NEXT: andl $-64, %edx
+; SSE-NEXT: shrl $6, %edx
+; SSE-NEXT: shll $6, %edx
; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm0
-; SSE-NEXT: pinsrw $1, %eax, %xmm0
-; SSE-NEXT: pextrw $2, %xmm1, %eax
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pinsrw $1, %eax, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %eax
; SSE-NEXT: leal 7(%rax), %ecx
; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-8, %ecx
+; SSE-NEXT: movzwl %cx, %ecx
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: shlw $3, %cx
; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm0
-; SSE-NEXT: pextrw $3, %xmm1, %eax
+; SSE-NEXT: pinsrw $2, %eax, %xmm1
+; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
; SSE-NEXT: shrl $16, %ecx
@@ -209,7 +213,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: imull $95, %edx, %ecx
; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm0
+; SSE-NEXT: pinsrw $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: dont_fold_srem_power_of_two:
@@ -218,13 +223,16 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; AVX-NEXT: leal 31(%rax), %ecx
; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: cmovnsl %eax, %ecx
-; AVX-NEXT: andl $-32, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: shrl $5, %ecx
+; AVX-NEXT: shlw $5, %cx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vmovd %xmm0, %ecx
; AVX-NEXT: leal 63(%rcx), %edx
; AVX-NEXT: testw %cx, %cx
; AVX-NEXT: cmovnsl %ecx, %edx
-; AVX-NEXT: andl $-64, %edx
+; AVX-NEXT: shrl $6, %edx
+; AVX-NEXT: shll $6, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vmovd %ecx, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
@@ -232,7 +240,9 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; AVX-NEXT: leal 7(%rax), %ecx
; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: cmovnsl %eax, %ecx
-; AVX-NEXT: andl $-8, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: shlw $3, %cx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
@@ -343,29 +353,31 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_i16_smax:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movswl %ax, %ecx
-; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: addl %eax, %ecx
-; SSE-NEXT: movzwl %cx, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: shrl $15, %ecx
-; SSE-NEXT: sarl $4, %edx
+; SSE-NEXT: pextrw $2, %xmm0, %ecx
+; SSE-NEXT: movswl %cx, %eax
+; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217
+; SSE-NEXT: shrl $16, %eax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: movzwl %ax, %edx
+; SSE-NEXT: movswl %dx, %eax
+; SSE-NEXT: shrl $15, %edx
+; SSE-NEXT: sarl $4, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: leal (%rax,%rax,2), %edx
+; SSE-NEXT: shll $3, %edx
+; SSE-NEXT: subl %edx, %eax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: pextrw $1, %xmm0, %ecx
+; SSE-NEXT: leal 32767(%rcx), %edx
+; SSE-NEXT: testw %cx, %cx
+; SSE-NEXT: cmovnsl %ecx, %edx
+; SSE-NEXT: movzwl %dx, %edx
+; SSE-NEXT: shrl $15, %edx
+; SSE-NEXT: shlw $15, %dx
; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT: shll $3, %ecx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: addl %eax, %edx
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: leal 32767(%rax), %ecx
-; SSE-NEXT: testw %ax, %ax
-; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000
-; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE-NEXT: pinsrw $2, %edx, %xmm1
+; SSE-NEXT: pinsrw $1, %edx, %xmm1
+; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
@@ -381,29 +393,31 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
;
; AVX-LABEL: dont_fold_urem_i16_smax:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: shrl $15, %ecx
-; AVX-NEXT: sarl $4, %edx
+; AVX-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-NEXT: movswl %cx, %eax
+; AVX-NEXT: imull $-19945, %eax, %eax # imm = 0xB217
+; AVX-NEXT: shrl $16, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movzwl %ax, %edx
+; AVX-NEXT: movswl %dx, %eax
+; AVX-NEXT: shrl $15, %edx
+; AVX-NEXT: sarl $4, %eax
+; AVX-NEXT: addl %edx, %eax
+; AVX-NEXT: leal (%rax,%rax,2), %edx
+; AVX-NEXT: shll $3, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpextrw $1, %xmm0, %ecx
+; AVX-NEXT: leal 32767(%rcx), %edx
+; AVX-NEXT: testw %cx, %cx
+; AVX-NEXT: cmovnsl %ecx, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: shrl $15, %edx
+; AVX-NEXT: shlw $15, %dx
; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX-NEXT: shll $3, %ecx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: addl %eax, %edx
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: leal 32767(%rax), %ecx
-; AVX-NEXT: testw %ax, %ax
-; AVX-NEXT: cmovnsl %eax, %ecx
-; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000
-; AVX-NEXT: addl %eax, %ecx
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
diff --git a/llvm/test/CodeGen/X86/sse2-vector-shifts.ll b/llvm/test/CodeGen/X86/sse2-vector-shifts.ll
index 0a7cd3392e66b3..1a38bddace8c30 100644
--- a/llvm/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -264,7 +264,8 @@ define <4 x i32> @srl_srl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @srl_shl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: srl_shl_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pslld $4, %xmm0
+; CHECK-NEXT: psrld $4, %xmm0
; CHECK-NEXT: retq
%srl0 = shl <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
%srl1 = lshr <4 x i32> %srl0, <i32 4, i32 4, i32 4, i32 4>
@@ -294,7 +295,8 @@ define <4 x i32> @shl_shl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_sra_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: shl_sra_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: psrld $4, %xmm0
+; CHECK-NEXT: pslld $4, %xmm0
; CHECK-NEXT: retq
%shl0 = ashr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
%shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>
@@ -321,9 +323,10 @@ define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_srl_v4i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: pslld $2, %xmm0
; CHECK-NEXT: retq
%srl = lshr <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
%zext = zext <4 x i16> %srl to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index e5ea911d4771a8..04980b58e9236b 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -57,19 +57,19 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: movsbl %dil, %eax
-; X64-NEXT: addl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: testw %ax, %ax
-; X64-NEXT: sets %dl
-; X64-NEXT: addl $32767, %edx # imm = 0x7FFF
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: shll %cl, %esi
-; X64-NEXT: movswl %si, %edi
+; X64-NEXT: addw %ax, %ax
+; X64-NEXT: movl %eax, %edx
+; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movswl %dx, %esi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: sarl %cl, %edi
-; X64-NEXT: cmpw %di, %ax
-; X64-NEXT: cmovnel %edx, %esi
-; X64-NEXT: movswl %si, %eax
+; X64-NEXT: sarl %cl, %esi
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testw %ax, %ax
+; X64-NEXT: sets %cl
+; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: cmpw %si, %ax
+; X64-NEXT: cmovel %edx, %ecx
+; X64-NEXT: movswl %cx, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -77,9 +77,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-LABEL: func2:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movswl %dx, %esi
@@ -127,9 +127,9 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X86-LABEL: func3:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll $7, %ecx
+; X86-NEXT: shlw $7, %cx
; X86-NEXT: addl %eax, %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shll %cl, %edx
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 64aead70415759..5c399ee4e81033 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -499,76 +499,129 @@ define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind {
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-LABEL: v16i4:
; SSE: # %bb.0:
+; SSE-NEXT: psllw $4, %xmm0
+; SSE-NEXT: psrlw $4, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [3855,3855,3855,3855,3855,3855,3855,3855]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: psubb %xmm2, %xmm0
+; SSE-NEXT: psllw $4, %xmm1
+; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: psubb %xmm2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE-NEXT: psllw $4, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: psllw $4, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: psubsb %xmm1, %xmm0
; SSE-NEXT: psrlw $4, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: psubb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v16i4:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855,3855,3855,3855,3855]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i4:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v16i4:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v16i4:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpternlogq $108, %xmm2, %xmm3, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpternlogq $108, %xmm2, %xmm3, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: retq
%z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 870912bb6bb1be..d8dfde8b5a76c0 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -315,11 +315,10 @@ define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs,
; X86-NEXT: jmp .LBB8_3
; X86-NEXT: .LBB8_2: # %compare
; X86-NEXT: movdqa %xmm0, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB8_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
@@ -452,11 +451,10 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
; X86-NEXT: jmp .LBB11_3
; X86-NEXT: .LBB11_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB11_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: leal -4(%ebp), %esp
@@ -772,11 +770,10 @@ define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movdqa %xmm0, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: movzwl %ax, %eax
@@ -889,11 +886,10 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
; X86-NEXT: jmp .LBB23_3
; X86-NEXT: .LBB23_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB23_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index f1fd05565c47e9..3498ae8a72a372 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2384,13 +2384,17 @@ define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v3i8:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movl (%rdi), %eax
-; SCALAR-NEXT: movl %eax, %ecx
+; SCALAR-NEXT: movl (%rdi), %ecx
+; SCALAR-NEXT: movl %ecx, %edi
+; SCALAR-NEXT: movzbl %cl, %eax
; SCALAR-NEXT: shrl $16, %ecx
-; SCALAR-NEXT: notb %cl
+; SCALAR-NEXT: shrl $8, %edi
+; SCALAR-NEXT: shlw $8, %di
+; SCALAR-NEXT: orl %edi, %eax
; SCALAR-NEXT: notl %eax
-; SCALAR-NEXT: movw %ax, (%rsi)
+; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, 2(%rsi)
+; SCALAR-NEXT: movw %ax, (%rsi)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movw %ax, (%rdx)
; SCALAR-NEXT: movb %cl, 6(%rdx)
@@ -2801,28 +2805,32 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-LABEL: vec384_v3i16:
; SCALAR: # %bb.0:
; SCALAR-NEXT: movq (%rdi), %rax
-; SCALAR-NEXT: movq %rax, %rcx
-; SCALAR-NEXT: shrq $32, %rcx
-; SCALAR-NEXT: notl %ecx
+; SCALAR-NEXT: movl %eax, %ecx
+; SCALAR-NEXT: movzwl %ax, %edi
+; SCALAR-NEXT: shrq $32, %rax
+; SCALAR-NEXT: shrl $16, %ecx
+; SCALAR-NEXT: shll $16, %ecx
+; SCALAR-NEXT: orl %ecx, %edi
+; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %eax
-; SCALAR-NEXT: movl %eax, (%rsi)
-; SCALAR-NEXT: movw %cx, 4(%rsi)
-; SCALAR-NEXT: movw %cx, 4(%rdx)
-; SCALAR-NEXT: movl %eax, (%rdx)
-; SCALAR-NEXT: movw %cx, 12(%rdx)
-; SCALAR-NEXT: movl %eax, 8(%rdx)
-; SCALAR-NEXT: movw %cx, 20(%rdx)
-; SCALAR-NEXT: movl %eax, 16(%rdx)
-; SCALAR-NEXT: movw %cx, 28(%rdx)
-; SCALAR-NEXT: movl %eax, 24(%rdx)
-; SCALAR-NEXT: movw %cx, 36(%rdx)
-; SCALAR-NEXT: movl %eax, 32(%rdx)
-; SCALAR-NEXT: movw %cx, 44(%rdx)
-; SCALAR-NEXT: movl %eax, 40(%rdx)
-; SCALAR-NEXT: movw %cx, 52(%rdx)
-; SCALAR-NEXT: movl %eax, 48(%rdx)
-; SCALAR-NEXT: movw %cx, 60(%rdx)
-; SCALAR-NEXT: movl %eax, 56(%rdx)
+; SCALAR-NEXT: movw %ax, 4(%rsi)
+; SCALAR-NEXT: movl %edi, (%rsi)
+; SCALAR-NEXT: movw %ax, 4(%rdx)
+; SCALAR-NEXT: movl %edi, (%rdx)
+; SCALAR-NEXT: movw %ax, 12(%rdx)
+; SCALAR-NEXT: movl %edi, 8(%rdx)
+; SCALAR-NEXT: movw %ax, 20(%rdx)
+; SCALAR-NEXT: movl %edi, 16(%rdx)
+; SCALAR-NEXT: movw %ax, 28(%rdx)
+; SCALAR-NEXT: movl %edi, 24(%rdx)
+; SCALAR-NEXT: movw %ax, 36(%rdx)
+; SCALAR-NEXT: movl %edi, 32(%rdx)
+; SCALAR-NEXT: movw %ax, 44(%rdx)
+; SCALAR-NEXT: movl %edi, 40(%rdx)
+; SCALAR-NEXT: movw %ax, 52(%rdx)
+; SCALAR-NEXT: movl %edi, 48(%rdx)
+; SCALAR-NEXT: movw %ax, 60(%rdx)
+; SCALAR-NEXT: movl %edi, 56(%rdx)
; SCALAR-NEXT: retq
;
; SSE2-ONLY-LABEL: vec384_v3i16:
@@ -3053,18 +3061,22 @@ define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR: # %bb.0:
; SCALAR-NEXT: movl 8(%rdi), %eax
; SCALAR-NEXT: movq (%rdi), %rcx
-; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: movl %ecx, %edi
+; SCALAR-NEXT: shrq $32, %rcx
+; SCALAR-NEXT: shlq $32, %rcx
+; SCALAR-NEXT: orq %rcx, %rdi
+; SCALAR-NEXT: notq %rdi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, 8(%rsi)
-; SCALAR-NEXT: movq %rcx, (%rsi)
+; SCALAR-NEXT: movq %rdi, (%rsi)
; SCALAR-NEXT: movl %eax, 8(%rdx)
-; SCALAR-NEXT: movq %rcx, (%rdx)
+; SCALAR-NEXT: movq %rdi, (%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
-; SCALAR-NEXT: movq %rcx, 16(%rdx)
+; SCALAR-NEXT: movq %rdi, 16(%rdx)
; SCALAR-NEXT: movl %eax, 40(%rdx)
-; SCALAR-NEXT: movq %rcx, 32(%rdx)
+; SCALAR-NEXT: movq %rdi, 32(%rdx)
; SCALAR-NEXT: movl %eax, 56(%rdx)
-; SCALAR-NEXT: movq %rcx, 48(%rdx)
+; SCALAR-NEXT: movq %rdi, 48(%rdx)
; SCALAR-NEXT: retq
;
; SSE2-ONLY-LABEL: vec384_v3i32:
@@ -3184,18 +3196,22 @@ define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR: # %bb.0:
; SCALAR-NEXT: movl 8(%rdi), %eax
; SCALAR-NEXT: movq (%rdi), %rcx
-; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: movl %ecx, %edi
+; SCALAR-NEXT: shrq $32, %rcx
+; SCALAR-NEXT: shlq $32, %rcx
+; SCALAR-NEXT: orq %rcx, %rdi
+; SCALAR-NEXT: notq %rdi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, 8(%rsi)
-; SCALAR-NEXT: movq %rcx, (%rsi)
+; SCALAR-NEXT: movq %rdi, (%rsi)
; SCALAR-NEXT: movl %eax, 8(%rdx)
-; SCALAR-NEXT: movq %rcx, (%rdx)
+; SCALAR-NEXT: movq %rdi, (%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
-; SCALAR-NEXT: movq %rcx, 16(%rdx)
+; SCALAR-NEXT: movq %rdi, 16(%rdx)
; SCALAR-NEXT: movl %eax, 40(%rdx)
-; SCALAR-NEXT: movq %rcx, 32(%rdx)
+; SCALAR-NEXT: movq %rdi, 32(%rdx)
; SCALAR-NEXT: movl %eax, 56(%rdx)
-; SCALAR-NEXT: movq %rcx, 48(%rdx)
+; SCALAR-NEXT: movq %rdi, 48(%rdx)
; SCALAR-NEXT: retq
;
; SSE2-ONLY-LABEL: vec384_v3f32:
@@ -3777,13 +3793,32 @@ define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v6i8:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movq (%rdi), %rax
-; SCALAR-NEXT: movq %rax, %rcx
+; SCALAR-NEXT: movq (%rdi), %rcx
+; SCALAR-NEXT: movq %rcx, %rax
+; SCALAR-NEXT: movl %ecx, %edi
+; SCALAR-NEXT: movl %ecx, %r8d
+; SCALAR-NEXT: movl %ecx, %r9d
+; SCALAR-NEXT: movzbl %cl, %r10d
; SCALAR-NEXT: shrq $32, %rcx
+; SCALAR-NEXT: shrq $40, %rax
+; SCALAR-NEXT: shrl $16, %edi
+; SCALAR-NEXT: shrl $24, %r8d
+; SCALAR-NEXT: shrl $8, %r9d
+; SCALAR-NEXT: shlw $8, %r9w
+; SCALAR-NEXT: orl %r9d, %r10d
+; SCALAR-NEXT: shlw $8, %r8w
+; SCALAR-NEXT: movzbl %dil, %edi
+; SCALAR-NEXT: orl %r8d, %edi
+; SCALAR-NEXT: shll $8, %eax
+; SCALAR-NEXT: movzbl %cl, %ecx
+; SCALAR-NEXT: orl %eax, %ecx
; SCALAR-NEXT: notl %ecx
+; SCALAR-NEXT: movw %cx, 4(%rsi)
+; SCALAR-NEXT: shll $16, %edi
+; SCALAR-NEXT: movzwl %r10w, %eax
+; SCALAR-NEXT: orl %edi, %eax
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, (%rsi)
-; SCALAR-NEXT: movw %cx, 4(%rsi)
; SCALAR-NEXT: movw %cx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movw %cx, 12(%rdx)
@@ -4028,20 +4063,38 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v6i16:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movl 8(%rdi), %ecx
-; SCALAR-NEXT: notl %ecx
-; SCALAR-NEXT: notq %rax
-; SCALAR-NEXT: movq %rax, (%rsi)
-; SCALAR-NEXT: movl %ecx, 8(%rsi)
-; SCALAR-NEXT: movl %ecx, 8(%rdx)
-; SCALAR-NEXT: movq %rax, (%rdx)
-; SCALAR-NEXT: movl %ecx, 24(%rdx)
-; SCALAR-NEXT: movq %rax, 16(%rdx)
-; SCALAR-NEXT: movl %ecx, 40(%rdx)
-; SCALAR-NEXT: movq %rax, 32(%rdx)
-; SCALAR-NEXT: movl %ecx, 56(%rdx)
-; SCALAR-NEXT: movq %rax, 48(%rdx)
+; SCALAR-NEXT: movzwl %cx, %eax
+; SCALAR-NEXT: shrl $16, %ecx
+; SCALAR-NEXT: movq (%rdi), %rdi
+; SCALAR-NEXT: movq %rdi, %r8
+; SCALAR-NEXT: movq %rdi, %r9
+; SCALAR-NEXT: movzwl %di, %r10d
+; SCALAR-NEXT: # kill: def $edi killed $edi killed $rdi
+; SCALAR-NEXT: shrl $16, %edi
+; SCALAR-NEXT: shrq $32, %r8
+; SCALAR-NEXT: shrq $48, %r9
+; SCALAR-NEXT: shll $16, %r9d
+; SCALAR-NEXT: movzwl %r8w, %r8d
+; SCALAR-NEXT: orl %r9d, %r8d
+; SCALAR-NEXT: shll $16, %edi
+; SCALAR-NEXT: orl %edi, %r10d
+; SCALAR-NEXT: shll $16, %ecx
+; SCALAR-NEXT: orl %ecx, %eax
+; SCALAR-NEXT: notl %eax
+; SCALAR-NEXT: movl %eax, 8(%rsi)
+; SCALAR-NEXT: shlq $32, %r8
+; SCALAR-NEXT: orq %r10, %r8
+; SCALAR-NEXT: notq %r8
+; SCALAR-NEXT: movq %r8, (%rsi)
+; SCALAR-NEXT: movl %eax, 8(%rdx)
+; SCALAR-NEXT: movq %r8, (%rdx)
+; SCALAR-NEXT: movl %eax, 24(%rdx)
+; SCALAR-NEXT: movq %r8, 16(%rdx)
+; SCALAR-NEXT: movl %eax, 40(%rdx)
+; SCALAR-NEXT: movq %r8, 32(%rdx)
+; SCALAR-NEXT: movl %eax, 56(%rdx)
+; SCALAR-NEXT: movq %r8, 48(%rdx)
; SCALAR-NEXT: retq
;
; SSE2-ONLY-LABEL: vec384_v6i16:
@@ -4159,21 +4212,33 @@ define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v6i32:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movq (%rdi), %rax
-; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %rcx
+; SCALAR-NEXT: movq 8(%rdi), %rax
+; SCALAR-NEXT: movl %ecx, %r8d
+; SCALAR-NEXT: shrq $32, %rcx
+; SCALAR-NEXT: movl %eax, %r9d
+; SCALAR-NEXT: shrq $32, %rax
; SCALAR-NEXT: movq 16(%rdi), %rdi
-; SCALAR-NEXT: notq %rdi
-; SCALAR-NEXT: notq %rcx
-; SCALAR-NEXT: notq %rax
-; SCALAR-NEXT: movq %rax, (%rsi)
-; SCALAR-NEXT: movq %rcx, 8(%rsi)
-; SCALAR-NEXT: movq %rdi, 16(%rsi)
-; SCALAR-NEXT: movq %rax, (%rdx)
-; SCALAR-NEXT: movq %rcx, 8(%rdx)
-; SCALAR-NEXT: movq %rdi, 16(%rdx)
-; SCALAR-NEXT: movq %rdi, 48(%rdx)
-; SCALAR-NEXT: movq %rcx, 40(%rdx)
-; SCALAR-NEXT: movq %rax, 32(%rdx)
+; SCALAR-NEXT: movl %edi, %r10d
+; SCALAR-NEXT: shrq $32, %rdi
+; SCALAR-NEXT: shlq $32, %rdi
+; SCALAR-NEXT: orq %rdi, %r10
+; SCALAR-NEXT: notq %r10
+; SCALAR-NEXT: shlq $32, %rax
+; SCALAR-NEXT: orq %rax, %r9
+; SCALAR-NEXT: notq %r9
+; SCALAR-NEXT: shlq $32, %rcx
+; SCALAR-NEXT: orq %rcx, %r8
+; SCALAR-NEXT: notq %r8
+; SCALAR-NEXT: movq %r8, (%rsi)
+; SCALAR-NEXT: movq %r9, 8(%rsi)
+; SCALAR-NEXT: movq %r10, 16(%rsi)
+; SCALAR-NEXT: movq %r8, (%rdx)
+; SCALAR-NEXT: movq %r9, 8(%rdx)
+; SCALAR-NEXT: movq %r10, 16(%rdx)
+; SCALAR-NEXT: movq %r10, 48(%rdx)
+; SCALAR-NEXT: movq %r9, 40(%rdx)
+; SCALAR-NEXT: movq %r8, 32(%rdx)
; SCALAR-NEXT: retq
;
; SSE2-LABEL: vec384_v6i32:
@@ -4231,21 +4296,33 @@ define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v6f32:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movq (%rdi), %rax
-; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %rcx
+; SCALAR-NEXT: movq 8(%rdi), %rax
+; SCALAR-NEXT: movl %ecx, %r8d
+; SCALAR-NEXT: shrq $32, %rcx
+; SCALAR-NEXT: movl %eax, %r9d
+; SCALAR-NEXT: shrq $32, %rax
; SCALAR-NEXT: movq 16(%rdi), %rdi
-; SCALAR-NEXT: notq %rdi
-; SCALAR-NEXT: notq %rcx
-; SCALAR-NEXT: notq %rax
-; SCALAR-NEXT: movq %rax, (%rsi)
-; SCALAR-NEXT: movq %rcx, 8(%rsi)
-; SCALAR-NEXT: movq %rdi, 16(%rsi)
-; SCALAR-NEXT: movq %rax, (%rdx)
-; SCALAR-NEXT: movq %rcx, 8(%rdx)
-; SCALAR-NEXT: movq %rdi, 16(%rdx)
-; SCALAR-NEXT: movq %rdi, 48(%rdx)
-; SCALAR-NEXT: movq %rcx, 40(%rdx)
-; SCALAR-NEXT: movq %rax, 32(%rdx)
+; SCALAR-NEXT: movl %edi, %r10d
+; SCALAR-NEXT: shrq $32, %rdi
+; SCALAR-NEXT: shlq $32, %rdi
+; SCALAR-NEXT: orq %rdi, %r10
+; SCALAR-NEXT: notq %r10
+; SCALAR-NEXT: shlq $32, %rax
+; SCALAR-NEXT: orq %rax, %r9
+; SCALAR-NEXT: notq %r9
+; SCALAR-NEXT: shlq $32, %rcx
+; SCALAR-NEXT: orq %rcx, %r8
+; SCALAR-NEXT: notq %r8
+; SCALAR-NEXT: movq %r8, (%rsi)
+; SCALAR-NEXT: movq %r9, 8(%rsi)
+; SCALAR-NEXT: movq %r10, 16(%rsi)
+; SCALAR-NEXT: movq %r8, (%rdx)
+; SCALAR-NEXT: movq %r9, 8(%rdx)
+; SCALAR-NEXT: movq %r10, 16(%rdx)
+; SCALAR-NEXT: movq %r10, 48(%rdx)
+; SCALAR-NEXT: movq %r9, 40(%rdx)
+; SCALAR-NEXT: movq %r8, 32(%rdx)
; SCALAR-NEXT: retq
;
; SSE2-LABEL: vec384_v6f32:
@@ -4534,20 +4611,80 @@ define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v12i8:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movq (%rdi), %rax
-; SCALAR-NEXT: movl 8(%rdi), %ecx
+; SCALAR-NEXT: pushq %rbp
+; SCALAR-NEXT: pushq %r15
+; SCALAR-NEXT: pushq %r14
+; SCALAR-NEXT: pushq %r13
+; SCALAR-NEXT: pushq %r12
+; SCALAR-NEXT: pushq %rbx
+; SCALAR-NEXT: movl 8(%rdi), %r8d
+; SCALAR-NEXT: movl %r8d, %eax
+; SCALAR-NEXT: movl %r8d, %ecx
+; SCALAR-NEXT: movzbl %r8b, %r9d
+; SCALAR-NEXT: shrl $8, %r8d
+; SCALAR-NEXT: shrl $16, %eax
+; SCALAR-NEXT: shrl $24, %ecx
+; SCALAR-NEXT: movq (%rdi), %rdi
+; SCALAR-NEXT: movq %rdi, %r10
+; SCALAR-NEXT: movq %rdi, %r11
+; SCALAR-NEXT: movq %rdi, %rbx
+; SCALAR-NEXT: movl %edi, %ebp
+; SCALAR-NEXT: movl %edi, %r14d
+; SCALAR-NEXT: movl %edi, %r15d
+; SCALAR-NEXT: movzbl %dil, %r12d
+; SCALAR-NEXT: movq %rdi, %r13
+; SCALAR-NEXT: shrq $32, %r13
+; SCALAR-NEXT: shrq $40, %r10
+; SCALAR-NEXT: shrq $48, %r11
+; SCALAR-NEXT: shrq $56, %rbx
+; SCALAR-NEXT: shrl $8, %ebp
+; SCALAR-NEXT: shrl $16, %r14d
+; SCALAR-NEXT: shrl $24, %r15d
+; SCALAR-NEXT: shlw $8, %r15w
+; SCALAR-NEXT: movzbl %r14b, %r14d
+; SCALAR-NEXT: orl %r15d, %r14d
+; SCALAR-NEXT: shll $16, %r14d
+; SCALAR-NEXT: shlw $8, %bp
+; SCALAR-NEXT: orl %ebp, %r12d
+; SCALAR-NEXT: movzwl %r12w, %edi
+; SCALAR-NEXT: orl %r14d, %edi
+; SCALAR-NEXT: shlw $8, %bx
+; SCALAR-NEXT: movzbl %r11b, %r11d
+; SCALAR-NEXT: orl %ebx, %r11d
+; SCALAR-NEXT: shll $16, %r11d
+; SCALAR-NEXT: shll $8, %r10d
+; SCALAR-NEXT: movzbl %r13b, %ebx
+; SCALAR-NEXT: orl %r10d, %ebx
+; SCALAR-NEXT: movzwl %bx, %r10d
+; SCALAR-NEXT: orl %r11d, %r10d
+; SCALAR-NEXT: shlw $8, %cx
+; SCALAR-NEXT: movzbl %al, %eax
+; SCALAR-NEXT: orl %ecx, %eax
+; SCALAR-NEXT: shll $16, %eax
+; SCALAR-NEXT: shlw $8, %r8w
+; SCALAR-NEXT: orl %r8d, %r9d
+; SCALAR-NEXT: movzwl %r9w, %ecx
+; SCALAR-NEXT: orl %eax, %ecx
; SCALAR-NEXT: notl %ecx
-; SCALAR-NEXT: notq %rax
-; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movl %ecx, 8(%rsi)
+; SCALAR-NEXT: shlq $32, %r10
+; SCALAR-NEXT: orq %r10, %rdi
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: movq %rdi, (%rsi)
; SCALAR-NEXT: movl %ecx, 8(%rdx)
-; SCALAR-NEXT: movq %rax, (%rdx)
+; SCALAR-NEXT: movq %rdi, (%rdx)
; SCALAR-NEXT: movl %ecx, 24(%rdx)
-; SCALAR-NEXT: movq %rax, 16(%rdx)
+; SCALAR-NEXT: movq %rdi, 16(%rdx)
; SCALAR-NEXT: movl %ecx, 40(%rdx)
-; SCALAR-NEXT: movq %rax, 32(%rdx)
+; SCALAR-NEXT: movq %rdi, 32(%rdx)
; SCALAR-NEXT: movl %ecx, 56(%rdx)
-; SCALAR-NEXT: movq %rax, 48(%rdx)
+; SCALAR-NEXT: movq %rdi, 48(%rdx)
+; SCALAR-NEXT: popq %rbx
+; SCALAR-NEXT: popq %r12
+; SCALAR-NEXT: popq %r13
+; SCALAR-NEXT: popq %r14
+; SCALAR-NEXT: popq %r15
+; SCALAR-NEXT: popq %rbp
; SCALAR-NEXT: retq
;
; SSE2-ONLY-LABEL: vec384_v12i8:
@@ -4665,21 +4802,70 @@ define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v12i16:
; SCALAR: # %bb.0:
-; SCALAR-NEXT: movq (%rdi), %rax
-; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: pushq %rbp
+; SCALAR-NEXT: pushq %r15
+; SCALAR-NEXT: pushq %r14
+; SCALAR-NEXT: pushq %r12
+; SCALAR-NEXT: pushq %rbx
+; SCALAR-NEXT: movq (%rdi), %r9
+; SCALAR-NEXT: movq 8(%rdi), %r8
+; SCALAR-NEXT: movq %r9, %rax
+; SCALAR-NEXT: movl %r9d, %ecx
+; SCALAR-NEXT: movzwl %r9w, %r10d
+; SCALAR-NEXT: shrq $32, %r9
+; SCALAR-NEXT: shrq $48, %rax
+; SCALAR-NEXT: shrl $16, %ecx
+; SCALAR-NEXT: movq %r8, %r11
+; SCALAR-NEXT: movl %r8d, %ebx
+; SCALAR-NEXT: movzwl %r8w, %r14d
+; SCALAR-NEXT: shrq $32, %r8
+; SCALAR-NEXT: shrq $48, %r11
+; SCALAR-NEXT: shrl $16, %ebx
; SCALAR-NEXT: movq 16(%rdi), %rdi
+; SCALAR-NEXT: movq %rdi, %r15
+; SCALAR-NEXT: movl %edi, %ebp
+; SCALAR-NEXT: movzwl %di, %r12d
+; SCALAR-NEXT: shrq $32, %rdi
+; SCALAR-NEXT: shrq $48, %r15
+; SCALAR-NEXT: shrl $16, %ebp
+; SCALAR-NEXT: shll $16, %ebp
+; SCALAR-NEXT: orl %ebp, %r12d
+; SCALAR-NEXT: shll $16, %r15d
+; SCALAR-NEXT: movzwl %di, %edi
+; SCALAR-NEXT: orl %r15d, %edi
+; SCALAR-NEXT: shlq $32, %rdi
+; SCALAR-NEXT: orq %r12, %rdi
; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: shll $16, %ebx
+; SCALAR-NEXT: orl %ebx, %r14d
+; SCALAR-NEXT: shll $16, %r11d
+; SCALAR-NEXT: movzwl %r8w, %r8d
+; SCALAR-NEXT: orl %r11d, %r8d
+; SCALAR-NEXT: shlq $32, %r8
+; SCALAR-NEXT: orq %r14, %r8
+; SCALAR-NEXT: notq %r8
+; SCALAR-NEXT: shll $16, %ecx
+; SCALAR-NEXT: orl %ecx, %r10d
+; SCALAR-NEXT: shll $16, %eax
+; SCALAR-NEXT: movzwl %r9w, %ecx
+; SCALAR-NEXT: orl %eax, %ecx
+; SCALAR-NEXT: shlq $32, %rcx
+; SCALAR-NEXT: orq %r10, %rcx
; SCALAR-NEXT: notq %rcx
-; SCALAR-NEXT: notq %rax
-; SCALAR-NEXT: movq %rax, (%rsi)
-; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, (%rsi)
+; SCALAR-NEXT: movq %r8, 8(%rsi)
; SCALAR-NEXT: movq %rdi, 16(%rsi)
-; SCALAR-NEXT: movq %rax, (%rdx)
-; SCALAR-NEXT: movq %rcx, 8(%rdx)
+; SCALAR-NEXT: movq %rcx, (%rdx)
+; SCALAR-NEXT: movq %r8, 8(%rdx)
; SCALAR-NEXT: movq %rdi, 16(%rdx)
; SCALAR-NEXT: movq %rdi, 48(%rdx)
-; SCALAR-NEXT: movq %rcx, 40(%rdx)
-; SCALAR-NEXT: movq %rax, 32(%rdx)
+; SCALAR-NEXT: movq %r8, 40(%rdx)
+; SCALAR-NEXT: movq %rcx, 32(%rdx)
+; SCALAR-NEXT: popq %rbx
+; SCALAR-NEXT: popq %r12
+; SCALAR-NEXT: popq %r14
+; SCALAR-NEXT: popq %r15
+; SCALAR-NEXT: popq %rbp
; SCALAR-NEXT: retq
;
; SSE2-LABEL: vec384_v12i16:
@@ -4928,6 +5114,108 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: movq 16(%rdi), %rdi
+; SCALAR-NEXT: movq %rdi, %r8
+; SCALAR-NEXT: shrq $32, %r8
+; SCALAR-NEXT: movq %rdi, %r9
+; SCALAR-NEXT: shrq $48, %r9
+; SCALAR-NEXT: movq %rdi, %r10
+; SCALAR-NEXT: shrq $56, %r10
+; SCALAR-NEXT: shlw $8, %r10w
+; SCALAR-NEXT: movzbl %r9b, %r9d
+; SCALAR-NEXT: orl %r10d, %r9d
+; SCALAR-NEXT: movq %rdi, %r10
+; SCALAR-NEXT: shrq $40, %r10
+; SCALAR-NEXT: shll $8, %r10d
+; SCALAR-NEXT: movzbl %r8b, %r8d
+; SCALAR-NEXT: orl %r10d, %r8d
+; SCALAR-NEXT: movl %edi, %r10d
+; SCALAR-NEXT: shrl $16, %r10d
+; SCALAR-NEXT: shll $16, %r9d
+; SCALAR-NEXT: movzwl %r8w, %r8d
+; SCALAR-NEXT: orl %r9d, %r8d
+; SCALAR-NEXT: movl %edi, %r9d
+; SCALAR-NEXT: shrl $24, %r9d
+; SCALAR-NEXT: shlw $8, %r9w
+; SCALAR-NEXT: movzbl %r10b, %r10d
+; SCALAR-NEXT: orl %r9d, %r10d
+; SCALAR-NEXT: movzbl %dil, %r11d
+; SCALAR-NEXT: # kill: def $edi killed $edi killed $rdi
+; SCALAR-NEXT: shrl $8, %edi
+; SCALAR-NEXT: shlw $8, %di
+; SCALAR-NEXT: orl %edi, %r11d
+; SCALAR-NEXT: movq %rcx, %r9
+; SCALAR-NEXT: shrq $32, %r9
+; SCALAR-NEXT: shll $16, %r10d
+; SCALAR-NEXT: movzwl %r11w, %edi
+; SCALAR-NEXT: orl %r10d, %edi
+; SCALAR-NEXT: movq %rcx, %r10
+; SCALAR-NEXT: shrq $48, %r10
+; SCALAR-NEXT: shlq $32, %r8
+; SCALAR-NEXT: orq %r8, %rdi
+; SCALAR-NEXT: movq %rcx, %r8
+; SCALAR-NEXT: shrq $56, %r8
+; SCALAR-NEXT: shlw $8, %r8w
+; SCALAR-NEXT: movzbl %r10b, %r10d
+; SCALAR-NEXT: orl %r8d, %r10d
+; SCALAR-NEXT: movq %rcx, %r8
+; SCALAR-NEXT: shrq $40, %r8
+; SCALAR-NEXT: shll $8, %r8d
+; SCALAR-NEXT: movzbl %r9b, %r9d
+; SCALAR-NEXT: orl %r8d, %r9d
+; SCALAR-NEXT: movl %ecx, %r11d
+; SCALAR-NEXT: shrl $16, %r11d
+; SCALAR-NEXT: shll $16, %r10d
+; SCALAR-NEXT: movzwl %r9w, %r8d
+; SCALAR-NEXT: orl %r10d, %r8d
+; SCALAR-NEXT: movl %ecx, %r9d
+; SCALAR-NEXT: shrl $24, %r9d
+; SCALAR-NEXT: shlw $8, %r9w
+; SCALAR-NEXT: movzbl %r11b, %r10d
+; SCALAR-NEXT: orl %r9d, %r10d
+; SCALAR-NEXT: movzbl %cl, %r11d
+; SCALAR-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; SCALAR-NEXT: shrl $8, %ecx
+; SCALAR-NEXT: shlw $8, %cx
+; SCALAR-NEXT: orl %ecx, %r11d
+; SCALAR-NEXT: movq %rax, %r9
+; SCALAR-NEXT: shrq $32, %r9
+; SCALAR-NEXT: shll $16, %r10d
+; SCALAR-NEXT: movzwl %r11w, %ecx
+; SCALAR-NEXT: orl %r10d, %ecx
+; SCALAR-NEXT: movq %rax, %r10
+; SCALAR-NEXT: shrq $48, %r10
+; SCALAR-NEXT: shlq $32, %r8
+; SCALAR-NEXT: orq %r8, %rcx
+; SCALAR-NEXT: movq %rax, %r8
+; SCALAR-NEXT: shrq $56, %r8
+; SCALAR-NEXT: shlw $8, %r8w
+; SCALAR-NEXT: movzbl %r10b, %r10d
+; SCALAR-NEXT: orl %r8d, %r10d
+; SCALAR-NEXT: movq %rax, %r8
+; SCALAR-NEXT: shrq $40, %r8
+; SCALAR-NEXT: shll $8, %r8d
+; SCALAR-NEXT: movzbl %r9b, %r9d
+; SCALAR-NEXT: orl %r8d, %r9d
+; SCALAR-NEXT: movl %eax, %r11d
+; SCALAR-NEXT: shrl $16, %r11d
+; SCALAR-NEXT: shll $16, %r10d
+; SCALAR-NEXT: movzwl %r9w, %r8d
+; SCALAR-NEXT: orl %r10d, %r8d
+; SCALAR-NEXT: movl %eax, %r9d
+; SCALAR-NEXT: shrl $24, %r9d
+; SCALAR-NEXT: shlw $8, %r9w
+; SCALAR-NEXT: movzbl %r11b, %r10d
+; SCALAR-NEXT: orl %r9d, %r10d
+; SCALAR-NEXT: movzbl %al, %r9d
+; SCALAR-NEXT: # kill: def $eax killed $eax killed $rax
+; SCALAR-NEXT: shrl $8, %eax
+; SCALAR-NEXT: shlw $8, %ax
+; SCALAR-NEXT: orl %eax, %r9d
+; SCALAR-NEXT: shll $16, %r10d
+; SCALAR-NEXT: movzwl %r9w, %eax
+; SCALAR-NEXT: orl %r10d, %eax
+; SCALAR-NEXT: shlq $32, %r8
+; SCALAR-NEXT: orq %r8, %rax
; SCALAR-NEXT: notq %rdi
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: notq %rax
diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll
index 0998a0f1d6ba9d..a76dfb86ec5d1e 100644
--- a/llvm/test/CodeGen/X86/test-vs-bittest.ll
+++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll
@@ -746,8 +746,8 @@ define i32 @setcc_is_bit_set(i32 %x) {
; CHECK-LABEL: setcc_is_bit_set:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: andl $1024, %eax # imm = 0x400
; CHECK-NEXT: shrl $10, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
%a1 = and i32 %x, 1024
%b1 = icmp ne i32 %a1, 0
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 5b1e0545502b81..2f578d8e5d3cf2 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -44,8 +44,7 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: addl %eax, %eax
-; X64-NEXT: cwtl
-; X64-NEXT: shrl %eax
+; X64-NEXT: sarw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
@@ -59,8 +58,7 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %ecx
; X86-NEXT: addl %eax, %eax
-; X86-NEXT: cwtl
-; X86-NEXT: shrl %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%x2 = sext i8 %x to i15
@@ -74,33 +72,35 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X64-LABEL: func3:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: andl $32767, %edi # imm = 0x7FFF
; X64-NEXT: movzbl %sil, %ecx
-; X64-NEXT: shll $4, %ecx
+; X64-NEXT: shll $7, %ecx
+; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: shrw $3, %cx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: addl %eax, %eax
-; X64-NEXT: cwtl
-; X64-NEXT: shrl %eax
+; X64-NEXT: sarw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: func3:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: shll $4, %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shlw $7, %cx
+; X86-NEXT: andl $32640, %ecx # imm = 0x7F80
+; X86-NEXT: shrw $3, %cx
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: addl %eax, %eax
-; X86-NEXT: cwtl
-; X86-NEXT: shrl %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%y2 = sext i8 %y to i15
@@ -213,9 +213,9 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
define i16 @func7(i16 %x, i16 %y) nounwind {
; X64-LABEL: func7:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $16, %eax
; X64-NEXT: movzwl %si, %ecx
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: shll $16, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 30a7f80b2315d5..3743f848fe2d81 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -14,7 +14,8 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl %si, %ecx
; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: shll $8, %eax
+; X64-NEXT: addl %eax, %eax
+; X64-NEXT: shll $7, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
@@ -27,9 +28,9 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X86-LABEL: func:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: shll $8, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: shll $7, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %ecx
; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
@@ -54,10 +55,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64-NEXT: divl %ecx
; X64-NEXT: cmpl $32767, %eax # imm = 0x7FFF
; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT: cmovbl %eax, %ecx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: movswl %cx, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: addl %eax, %eax
+; X64-NEXT: sarw %ax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
@@ -72,10 +72,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-NEXT: divl %ecx
; X86-NEXT: cmpl $32767, %eax # imm = 0x7FFF
; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT: cmovbl %eax, %ecx
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: movswl %cx, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: sarw %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%x2 = sext i8 %x to i15
@@ -89,9 +88,11 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X64-LABEL: func3:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: andl $32767, %edi # imm = 0x7FFF
; X64-NEXT: movzbl %sil, %ecx
-; X64-NEXT: shll $4, %ecx
+; X64-NEXT: shll $7, %ecx
+; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: shrw $3, %cx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
@@ -101,18 +102,19 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF
; X64-NEXT: cmovbl %eax, %ecx
; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: movswl %cx, %eax
-; X64-NEXT: shrl %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: sarw %cx
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
;
; X86-LABEL: func3:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: shll $4, %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shlw $7, %cx
+; X86-NEXT: andl $32640, %ecx # imm = 0x7F80
+; X86-NEXT: shrw $3, %cx
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %cx
@@ -122,9 +124,8 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF
; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: movswl %cx, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: sarw %cx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
%y2 = sext i8 %y to i15
%y3 = shl i15 %y2, 7
@@ -283,15 +284,14 @@ define i16 @func7(i16 %x, i16 %y) nounwind {
; X86-LABEL: func7:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shll $17, %edx
-; X86-NEXT: shrl $15, %ecx
-; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shrl $15, %edx
+; X86-NEXT: shll $17, %eax
; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
; X86-NEXT: pushl %ecx
; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %eax
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c9..a63e144e85c906 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -603,9 +603,10 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X64-NEXT: movq %r12, 16(%rdi)
; X64-NEXT: movq %rax, 24(%rdi)
; X64-NEXT: movl %esi, 32(%rdi)
-; X64-NEXT: shrq $32, %rsi
-; X64-NEXT: andl $4095, %esi # imm = 0xFFF
-; X64-NEXT: movw %si, 36(%rdi)
+; X64-NEXT: movabsq $17587891077120, %rax # imm = 0xFFF00000000
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movw %ax, 36(%rdi)
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 8c7078c7263284..c25d2d312a2174 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -321,6 +321,7 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $15, %sil
+; X64-NEXT: andb $15, %al
; X64-NEXT: shlb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: mulb %sil
@@ -336,6 +337,7 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andb $15, %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andb $15, %al
; X86-NEXT: shlb $4, %al
; X86-NEXT: mulb %cl
; X86-NEXT: movzbl %al, %ecx
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index b4e91da920a2fd..6aa058af1fb548 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -110,9 +110,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; X86: # %bb.0:
; X86-NEXT: imull $683, {{[0-9]+}}(%esp), %eax # imm = 0x2AB
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shll $10, %ecx
-; X86-NEXT: andl $2046, %eax # imm = 0x7FE
-; X86-NEXT: shrl %eax
+; X86-NEXT: andl $2046, %ecx # imm = 0x7FE
+; X86-NEXT: shrw %cx
+; X86-NEXT: shll $10, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: andl $2047, %eax # imm = 0x7FF
; X86-NEXT: cmpl $342, %eax # imm = 0x156
diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll
index e0e1ef7108d0d1..72d6eec2080b85 100644
--- a/llvm/test/CodeGen/X86/ushl_sat.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat.ll
@@ -49,7 +49,7 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: movsbl %dil, %eax
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: addw %ax, %ax
; X64-NEXT: movl %eax, %edx
; X64-NEXT: shll %cl, %edx
; X64-NEXT: movzwl %dx, %esi
@@ -66,9 +66,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-LABEL: func2:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addw %ax, %ax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movzwl %dx, %esi
@@ -110,9 +110,9 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X86-LABEL: func3:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll $7, %ecx
+; X86-NEXT: shlw $7, %cx
; X86-NEXT: addl %eax, %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shll %cl, %edx
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index f8bc6b01c70a84..b73ddc3cb7f7f8 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -294,7 +294,7 @@ define <2 x i8> @PR58661(<2 x i8> %a0) {
; CHECK: # %bb.0:
; CHECK-NEXT: psrlw $8, %xmm0
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: shll $8, %eax
+; CHECK-NEXT: shlw $8, %ax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <2 x i8> %a0, <2 x i8> <i8 poison, i8 0>, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 90cc3d5fdde829..59a3f8ac3c8639 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -81,47 +81,49 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
-; SSE-NEXT: shll $4, %eax
+; SSE-NEXT: shlw $4, %ax
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: andl $3855, %edi # imm = 0xF0F
-; SSE-NEXT: orl %eax, %edi
-; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: orl %edi, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $13107, %ecx # imm = 0x3333
+; SSE-NEXT: shlw $2, %cx
+; SSE-NEXT: shrl $2, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
-; SSE-NEXT: shrl $2, %edi
-; SSE-NEXT: andl $13107, %edi # imm = 0x3333
-; SSE-NEXT: leal (%rdi,%rax,4), %eax
+; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
+; SSE-NEXT: addw %cx, %cx
; SSE-NEXT: shrl %eax
; SSE-NEXT: andl $21845, %eax # imm = 0x5555
-; SSE-NEXT: leal (%rax,%rcx,2), %eax
+; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
-; AVX-NEXT: shll $4, %eax
+; AVX-NEXT: shlw $4, %ax
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: andl $3855, %edi # imm = 0xF0F
-; AVX-NEXT: orl %eax, %edi
-; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: orl %edi, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: andl $13107, %ecx # imm = 0x3333
+; AVX-NEXT: shlw $2, %cx
+; AVX-NEXT: shrl $2, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
-; AVX-NEXT: shrl $2, %edi
-; AVX-NEXT: andl $13107, %edi # imm = 0x3333
-; AVX-NEXT: leal (%rdi,%rax,4), %eax
+; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
+; AVX-NEXT: addw %cx, %cx
; AVX-NEXT: shrl %eax
; AVX-NEXT: andl $21845, %eax # imm = 0x5555
-; AVX-NEXT: leal (%rax,%rcx,2), %eax
+; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 85c1e25c29ed5b..61aeaf0c3145e9 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -3722,6 +3722,7 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
; X86-SSE2-NEXT: sarl $15, %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
; X86-SSE2-NEXT: movl %eax, %edx
+; X86-SSE2-NEXT: andl $-4, %edx
; X86-SSE2-NEXT: shll $13, %edx
; X86-SSE2-NEXT: sarl $15, %edx
; X86-SSE2-NEXT: movd %edx, %xmm1
@@ -3755,6 +3756,7 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
; X86-SSE41-NEXT: sarl $15, %eax
; X86-SSE41-NEXT: movd %eax, %xmm0
; X86-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
+; X86-SSE41-NEXT: andl $-4, %esi
; X86-SSE41-NEXT: shll $13, %esi
; X86-SSE41-NEXT: sarl $15, %esi
; X86-SSE41-NEXT: pinsrd $2, %esi, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 388511ce0741f6..1e739bd631161b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -151,7 +151,7 @@ define void @PR46178(ptr %0) {
define <8 x i32> @PR46393(<8 x i16> %a0, i8 %a1) {
; X86-LABEL: PR46393:
; X86: # %bb.0:
-; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpmovsxwd %xmm0, %ymm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vpslld $16, %ymm0, %ymm0 {%k1} {z}
@@ -159,7 +159,7 @@ define <8 x i32> @PR46393(<8 x i16> %a0, i8 %a1) {
;
; X64-LABEL: PR46393:
; X64: # %bb.0:
-; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpslld $16, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index c3e9a2b6841ae2..7d4f3c8b3de15e 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -321,25 +321,55 @@ define <16 x i8> @test_128_i8_x_16_28_mask_ashr_1(<16 x i8> %a0) {
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: psrlw $1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_ashr_1:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: psrlw $1, %xmm0
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-SSE2-NEXT: por %xmm1, %xmm0
+; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0, <i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28>
%t1 = ashr <16 x i8> %t0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %t1
@@ -349,25 +379,55 @@ define <16 x i8> @test_128_i8_x_16_28_mask_ashr_2(<16 x i8> %a0) {
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: psrlw $2, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_2:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; X86-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; X86-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_ashr_2:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: psrlw $2, %xmm0
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; X64-SSE2-NEXT: por %xmm1, %xmm0
+; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_2:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; X64-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0, <i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28, i8 28>
%t1 = ashr <16 x i8> %t0, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <16 x i8> %t1
@@ -3002,27 +3062,55 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_ashr_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: psrlq $15, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X86-SSE2-NEXT: psrad $15, %xmm1
+; X86-SSE2-NEXT: psrlq $15, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlq $15, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrad $15, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpsrlq $15, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrad $15, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpsrlq $15, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: psrlq $15, %xmm0
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X64-SSE2-NEXT: psrad $15, %xmm1
+; X64-SSE2-NEXT: psrlq $15, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlq $15, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrad $15, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpsrlq $15, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrad $15, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpsrlq $15, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-AVX2-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
%t1 = ashr <2 x i64> %t0, <i64 15, i64 15>
ret <2 x i64> %t1
@@ -3030,27 +3118,55 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_16(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: psrlq $16, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: psrlq $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlq $16, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrad $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrad $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: psrlq $16, %xmm0
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X64-SSE2-NEXT: psrad $16, %xmm1
+; X64-SSE2-NEXT: psrlq $16, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlq $16, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrad $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrad $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-AVX2-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
%t1 = ashr <2 x i64> %t0, <i64 16, i64 16>
ret <2 x i64> %t1
@@ -3213,6 +3329,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrad $31, %xmm1
@@ -3222,6 +3339,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
;
; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
@@ -3230,6 +3348,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
;
; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
@@ -3238,6 +3357,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
;
; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrad $31, %xmm1
@@ -3247,6 +3367,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
;
; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
@@ -3255,6 +3376,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
;
; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 9acd995d612c31..d576c9efe74f32 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -765,18 +765,18 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: shll $15, %eax
+; SSE-NEXT: shlq $15, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: vselect_any_extend_vector_inreg_crash:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: shll $15, %eax
+; AVX1-NEXT: shlq $15, %rax
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_any_extend_vector_inreg_crash:
@@ -784,9 +784,9 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [49,49,49,49]
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: shll $15, %eax
+; AVX2-NEXT: shlq $15, %rax
; AVX2-NEXT: retq
0:
%1 = load <8 x i8>, ptr %x
diff --git a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
index ae8d450d1345b9..78ae4b51d2a324 100644
--- a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
+++ b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
@@ -6,7 +6,8 @@ define i64 @baz(i32 %A) nounwind {
; CHECK-LABEL: baz:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shlq $49, %rax
+; CHECK-NEXT: shll $17, %eax
+; CHECK-NEXT: shlq $32, %rax
; CHECK-NEXT: retq
%tmp1 = shl i32 %A, 17
%tmp2 = zext i32 %tmp1 to i64
diff --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll
index bc0981781df8ff..8c27e0da6acf75 100644
--- a/llvm/test/CodeGen/X86/zext-shl.ll
+++ b/llvm/test/CodeGen/X86/zext-shl.ll
@@ -51,7 +51,7 @@ define i64 @i64_zext_shift_i16_zext_i8(i8 %a0) nounwind {
; X64-LABEL: i64_zext_shift_i16_zext_i8:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shll $5, %eax
+; X64-NEXT: shlq $5, %rax
; X64-NEXT: retq
%t0 = zext i8 %a0 to i16
%t1 = shl i16 %t0, 5
@@ -112,7 +112,7 @@ define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
; X64-LABEL: i128_zext_shift_i64_zext_i8:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shll $4, %eax
+; X64-NEXT: shlq $4, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%t0 = zext i8 %a0 to i64
@@ -136,7 +136,7 @@ define i128 @i128_zext_shift_i64_zext_i16(i16 %a0) nounwind {
; X64-LABEL: i128_zext_shift_i64_zext_i16:
; X64: # %bb.0:
; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: shll $7, %eax
+; X64-NEXT: shlq $7, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%t0 = zext i16 %a0 to i64